Commit a8cfdec0 authored by Lea Kyveli Chrysanthopoulou

Add modal verb distribution

parent b9aad95f
@@ -10,8 +10,6 @@ import pandas as pd
import statistics
import re
# you'll also have to download "punkt" from nltk
# create function for bar (value) labels
def addlabels(x,y):
for i in range(len(x)):
@@ -26,7 +24,7 @@ def read_works_into_string(directory_path):
strings = []
works = os.listdir(directory_path)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r") as f:
with open(f"{directory_path}"+f"/{work}", "r", errors='ignore') as f: #ignores mostly unicode errors due to problematic encoding of text files
strings.append(f.read())
return "\n".join(strings)
@@ -91,6 +89,27 @@ def tokenize_and_clean_text(text):
short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
return short_clean_tokens
def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
if most_common_limit is False:
freq_dist = FreqDist(list_of_items)
else:
freq_dist = FreqDist(list_of_items).most_common(most_common_limit)
# convert the FreqDist object to a pandas Series for easier processing
dist_panda = pd.Series(dict(freq_dist))
# sort the Series by its index, then normalise and round each count
new_dist = dist_panda.sort_index()
for i in range(len(new_dist.index)):
new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) # relative frequency, rounded to three decimals
return new_dist
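A quick usage sketch of the new helper (the sample data is made up; assumes FreqDist and pandas are imported as above):

# sketch: ten token lengths, turned into a sorted, normalised Series
lengths = [1, 2, 2, 3, 3, 3, 3, 3, 3, 3]
print(calculate_freq_dist_as_clean_panda(lengths))
# index 1 -> 0.1, 2 -> 0.2, 3 -> 0.7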
def mendenhall_token_metrics(tokens):
# create the distribution of token lengths / Mendenhall curve
@@ -101,23 +120,8 @@ def mendenhall_token_metrics(tokens):
trim_percent = 0.005
trim_len = int(len(token_lengths) * trim_percent / 2)
token_lengths = sorted(token_lengths)[trim_len:-trim_len]
new_token_len_dist = calculate_freq_dist_as_clean_panda(token_lengths, most_common_limit=15) # token len freq dist
token_length_distribution = FreqDist(token_lengths).most_common(15)
# convert the FreqDist object to a pandas Series for easier processing
token_len_dist_panda = pd.Series(dict(token_length_distribution))
# sort, normalise and round the pandas Series
new_token_len_dist = token_len_dist_panda.sort_index()
for i in range(len(new_token_len_dist.index)):
new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(tokens), 3)
standard_deviation = statistics.stdev(token_lengths)
mean = statistics.mean(token_lengths)
@@ -157,6 +161,7 @@ def pos_tag_freq(tokens):
punctuation_regex = r"[^\w\s]+"
summarised_tags = []
punctuation_tags = []
modal_verbs = []
index = 0
for token, tag in tag_token_tuples:
if re.match(punctuation_regex, token):
@@ -170,7 +175,10 @@ def pos_tag_freq(tokens):
punctuation_map = {".": "full_stop", "?": "question_mark", "!": "exclamation_mark", ",": "comma", ";": "semicolon", "-": "dash"}
punctuation_tags.append(punctuation_map.get(token, "other_punct"))
else:
if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
if tag in ["MD"]:
summarised_tags.append("modal verb")
modal_verbs.append(token.lower())
elif tag in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
summarised_tags.append("verb")
elif tag in ["JJ", "JJR", "JJS"]:
summarised_tags.append("adjective")
@@ -192,33 +200,16 @@ def pos_tag_freq(tokens):
summarised_tags.append("other_tag")
index += 1
tag_freq_dist = FreqDist(summarised_tags)
# convert the FreqDist object to a pandas Series for easier processing
tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
# sort, normalise and round the pandas Series
new_tag_freq_dist = tag_freq_dist_panda.sort_index()
for i in range(len(new_tag_freq_dist.index)):
new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2)
# POS tag frequency distribution
new_tag_freq_dist = calculate_freq_dist_as_clean_panda(summarised_tags)
#punctuation frequency distribution
punct_tag_freq_dist = FreqDist(punctuation_tags)
new_punct_tag_freq_dist = calculate_freq_dist_as_clean_panda(punctuation_tags)
# convert the FreqDist object to a pandas Series for easier processing
punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
# sort, normalise and round the pandas Series
new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
for i in range(len(new_punct_tag_freq_dist.index)):
new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3)
# modal verbs in more detail
new_md_freq_dist_panda = calculate_freq_dist_as_clean_panda(modal_verbs, most_common_limit=10)
return new_tag_freq_dist, new_punct_tag_freq_dist
return new_tag_freq_dist, new_punct_tag_freq_dist, new_md_freq_dist_panda
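For reference, NLTK's Penn Treebank tagset marks modal verbs as MD, which is what the new branch above collects; a minimal sketch (requires the averaged_perceptron_tagger model to be downloaded once):

import nltk
# nltk.download("averaged_perceptron_tagger")  # one-time setup
print(nltk.pos_tag(["she", "could", "have", "danced"]))
# e.g. [('she', 'PRP'), ('could', 'MD'), ('have', 'VB'), ('danced', 'VBN')]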
#f"throne_of_glass/data/canon_works"
def extract_info_from_directory_path(directory_path):
@@ -234,39 +225,9 @@ def extract_info_from_directory_path(directory_path):
std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
mean_tokens.append(mean_tk)
class StylometryMetrics:
def calculate_sent_len_dist(text):
def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
self.text = read_works_into_string(directory_path)
self.clean_tokens = tokenize_and_clean_text(self.text)
self.name = name_of_work
self.fanfiction = fanfiction
self.quality = quality # good medium bad
def calculate_standardised_ttr(self):
self.sttr = standardised_type_token_ratio(self.clean_tokens)
def calculate_mendenhall_token_metrics(self):
self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
def plot_token_metrics(self, file_path_for_pic):
plt_title = self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon" + " Token Frequency Distribution"
plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurrence")
def calculate_pos_tag_distribution(self):
self.tag_freq_dist, self.punct_tag_freq_dist = pos_tag_freq(self.clean_tokens)
def plot_pos_tag_freq(self, file_path_for_pic):
plt_title = "POS Tag Frequencies for the " + self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon"
plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurrence")
def plot_punct_freq(self, file_path_for_pic):
plt_title = "Punctuation Frequencies for the " + self.name + " " + (self.quality + " ") if self.fanfiction else "" + "Fanfiction" if self.fanfiction else " Canon"
plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurrence")
def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
sents = sent_tokenize(corpus)
sents = sent_tokenize(text)
sent_lens = []
for sent in sents:
short_clean_tokens = tokenize_and_clean_text(sent)
@@ -279,76 +240,68 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
trim_percent = 0.05
trim_len = int(len(sent_lens) * trim_percent / 2)
sent_lens = sorted(sent_lens)[trim_len:-trim_len]
sent_len_dist = FreqDist(sent_lens)
# convert the FreqDist object to a pandas Series for easier processing
sent_len_dist_panda = pd.Series(dict(sent_len_dist))
# sort, normalise and round the pandas Series
sent_len_dist = calculate_freq_dist_as_clean_panda(sent_lens)
new_sent_len_dist = sent_len_dist_panda.sort_index()
# plot the 25 most frequent sentence lengths as a barplot for a more detailed insight
sent_len_dist_short = calculate_freq_dist_as_clean_panda(sent_lens, most_common_limit=25)
for i in range(len(new_sent_len_dist.index)):
new_sent_len_dist.iat[i] = round(new_sent_len_dist.iat[i]/len(sent_lens), 2)
# plot using matplotlib and seaborn
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
#addlabels(x=new_sent_len_dist.index, y=new_sent_len_dist.values)
# calculate the standard deviation, mean
standard_deviation_sent = statistics.stdev(sent_lens)
mean_sent = statistics.mean(sent_lens)
plt.title(curve_title)
ax.set_xlabel("Sentence Length")
ax.set_ylabel("Percentage of Occurence")
sns.lineplot(x=new_sent_len_dist.index, y=new_sent_len_dist.values, ax=ax, palette="crest")
#plt.xticks(rotation=30) # very useful for word labels
plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_long.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
return sent_len_dist, sent_len_dist_short, standard_deviation_sent, mean_sent
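The symmetric trimming used above deserves a standalone sketch; the explicit end index guards against the empty result that a [t:-t] slice yields when t happens to be 0:

# sketch: drop roughly 5% of the most extreme values (2.5% per tail)
def trim_outliers(values, trim_percent=0.05):
    trim_len = int(len(values) * trim_percent / 2)
    values = sorted(values)
    return values[trim_len:len(values) - trim_len]

print(len(trim_outliers(list(range(100)))))  # 96: two values cut from each tail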
# plot the 25 most frequent sentence lengths as a barplot for a more detailed insight
sent_len_dist_short = FreqDist(sent_lens).most_common(25)
class StylometryMetrics:
# convert the FreqDist object to a pandas Series for easier processing
sent_len_dist_short_panda = pd.Series(dict(sent_len_dist_short))
def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
self.text = read_works_into_string(directory_path)
self.clean_tokens = tokenize_and_clean_text(self.text)
self.name = name_of_work
self.fanfiction = fanfiction
self.quality = quality # one of "good", "medium", "bad"
# sort, normalise and round the pandas Series
def determine_titles(self, plot_topic):
if self.fanfiction:
plt_title = f"{plot_topic} for the {self.name} {self.quality} Fanfiction"
else:
plt_title = f"{plot_topic} for the {self.name} Canon"
return plt_title
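For illustration, the titles this helper produces (the attribute values are made up):

# with name="Grishaverse", quality="good", fanfiction=True:
#   "POS Tag Frequencies for the Grishaverse good Fanfiction"
# with fanfiction=False:
#   "POS Tag Frequencies for the Grishaverse Canon"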
new_sent_len_dist_short = sent_len_dist_short_panda.sort_index()
def calculate_standardised_ttr(self):
self.sttr = standardised_type_token_ratio(self.clean_tokens)
for i in range(len(new_sent_len_dist_short.index)):
new_sent_len_dist_short.iat[i] = round(new_sent_len_dist_short.iat[i]/len(sent_lens), 2)
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
def calculate_mendenhall_token_metrics(self):
self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
# call function for bar (value) labels
addlabels(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values)
def plot_token_metrics(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Token Frequency Distribution")
plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurrence")
plt.title(curve_title)
ax.set_xlabel("Sentence Length")
ax.set_ylabel("Percentage of Occurrence")
def calculate_pos_tag_distribution(self):
self.tag_freq_dist, self.punct_tag_freq_dist, self.md_freq_dist = pos_tag_freq(self.clean_tokens)
sns.barplot(x=new_sent_len_dist_short.index, y=new_sent_len_dist_short.values, ax=ax, palette="YlGnBu")
#plt.xticks(rotation=30) # very useful for word labels
plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_sent_len_short.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
def calculate_sent_len_distribution(self):
self.sent_len_dist, self.sent_len_dist_short, self.sent_std_dev, self.sent_mean = calculate_sent_len_dist(self.text)
# calculate the standard deviation, mean, token/type ratio
standard_deviation_sent = statistics.stdev(sent_lens)
mean_sent = statistics.mean(sent_lens)
def plot_long_sent_len_dist(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Full Sentence Length Distribution")
plot_distribution(x=self.sent_len_dist, y=self.sent_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence", plt_type="lineplot")
def plot_short_sent_len_dist(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Short Sentence Length Distribution")
plot_distribution(x=self.sent_len_dist_short, y=self.sent_len_dist_short, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Sentence Lengths", y_label="Percentage of Occurrence")
return standard_deviation_sent, mean_sent
def plot_pos_tag_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="POS Tag Frequencies")
plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurrence")
def plot_md_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Modal Verb Frequencies")
plot_distribution(x=self.md_freq_dist, y=self.md_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Modal Verbs", y_label="Percentage of Occurrence")
def plot_punct_freq(self, file_path_for_pic):
plt_title = self.determine_titles(plot_topic="Punctuation Frequencies")
plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurrence")
# overall pos_tag frequency distribution
# pos_tag ngrams; (maybe exclude stopwords?)
@@ -358,7 +311,6 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
# most frequent adjectives
#create the Mendenhall Curve for the Throne of Glass Series
#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
@@ -410,35 +362,39 @@ def run_functions(directory_path):
#grishaverse/data/split_txt_fanfics
#create lists for each of the columns of the dataframe we'll create
mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_sent":std_dev_sents},
index = index
)
def create_dataframe_with_overview_info():
#create lists for each of the columns of the dataframe we'll create
mean_tokens = [mean_tokens_tog_canon, mean_tokens_grishaverse_canon]
std_dev_tokens = [std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon]
type_token_ratio = [type_token_ratio_tog_canon, type_token_ratio_grishaverse_canon]
mean_sent = [mean_sent_tog_canon, mean_sent_grishaverse_canon]
std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon]
index = ["throne_of_glass_canon", "grishaverse_canon"]
# create a dataframe to store all the overview statistics in
# columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15;
# mean_sent; std_dev_sent; freq_sent_len ....
# tag_frequencies
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
data_overview = pd.DataFrame(
{"mean_tokens":mean_tokens,
"std_dev_tokens":std_dev_tokens,
"type_token_ratio":type_token_ratio,
"mean_sent":mean_sent,
"std_dev_sent":std_dev_sents},
index = index
)
return data_overview
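A short usage sketch, assuming the per-series statistics referenced above are in scope and the helper returns the frame:

overview = create_dataframe_with_overview_info()
overview.to_csv("data_overview/data_overview.csv")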
if __name__ == "__main__":
#run_functions("grishaverse/data/split_txt_fanfics")
#run_functions("throne_of_glass/data/split_txt_fanfics")
#data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
#data_overview.to_csv(f"data_overview/data_overview.csv")
GrishaverseCanon = StylometryMetrics(directory_path="grishaverse/data/canon_works", name_of_work="Grishaverse", fanfiction=False)
GrishaverseCanon.calculate_pos_tag_distribution()
GrishaverseCanon.plot_md_freq("grishaverse/plots/canon/md_freq.png")
\ No newline at end of file
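A sketch of how the remaining metrics slot into the same flow (the output paths are illustrative):

GrishaverseCanon.calculate_mendenhall_token_metrics()
GrishaverseCanon.plot_token_metrics("grishaverse/plots/canon/token_len.png")
GrishaverseCanon.calculate_standardised_ttr()
print(GrishaverseCanon.sttr)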
grishaverse/plots/canon/md_freq.png (43.6 KiB)
grishaverse/plots/canon/pos_tag_freq.png (55.9 KiB)
alki is stupid
\ No newline at end of file
me is good for filling stuffs
\ No newline at end of file