Commit b9aad95f authored by Lea Kyveli Chrysanthopoulou

Clean up POS Tags

parent d1028a35
@@ -123,7 +123,7 @@ def mendenhall_token_metrics(tokens):
 
     return new_token_len_dist, standard_deviation, mean
 
-def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence", palette="flare", plt_type="barplot", add_labels=True):
+def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of Kudos", y_label="Percentage of Occurence", palette="flare", plt_type="barplot", add_labels=True, rotate_ticks=True):
 
     plt.figure(figsize=(10,10))
     plt.title(plt_title)
@@ -133,45 +133,115 @@ def plot_distribution(x, y, plt_title, file_path_for_pic:str, x_label="Number of
 
     if add_labels:
         addlabels(x=x.index, y=y.values)
 
-    match case:
-        sns.scatterplot(x=x.index, y=y.values, palette=palette)
-        #plt.xticks(new_dist.index[::100], new_dist.index[::100])
-    else:
-        sns.lineplot(x=x.index, y=y.values, palette=palette)
+    match plt_type:
+        case "scatterplot":
+            sns.scatterplot(x=x.index, y=y.values, palette=palette)
+        case "lineplot":
+            sns.lineplot(x=x.index, y=y.values, palette=palette)
+        case "barplot":
+            sns.barplot(x=x.index, y=y.values, palette=palette)
+        case "histplot":
+            sns.histplot(x=x.index, y=y.values, palette=palette)
+        case _:
+            print(f"{plt_type} is not a valid format for this function")
+
+    if rotate_ticks:
+        plt.xticks(rotation=30) # !!! very useful for words
 
     plt.savefig(file_path_for_pic)
     plt.close()
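For orientation, a usage sketch of the reworked plotter (not part of the commit; the importable module name stylometry_metrics is an assumption). Callers in this file pass the same pandas Series as both x and y, since only x.index and y.values are read:

# Hedged usage sketch; assumes this script is importable as stylometry_metrics
# (hypothetical name) and that pandas/seaborn/matplotlib are installed.
import pandas as pd
from stylometry_metrics import plot_distribution  # hypothetical module name

toy_dist = pd.Series({1: 0.05, 2: 0.17, 3: 0.21, 4: 0.18, 5: 0.12})
plot_distribution(
    x=toy_dist, y=toy_dist,             # same Series twice: only x.index / y.values are used
    plt_title="Toy Token Length Distribution",
    file_path_for_pic="toy_token_len.png",  # illustrative output path
    x_label="Token Length",
    y_label="Percentage of Occurence",
    plt_type="barplot",
    rotate_ticks=False,                 # the flag this commit introduces
)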
 
-    # plot using matplotlib and seaborn
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
-
-    plt.title(curve_title)
-    ax.set_xlabel("Word Length")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
-    #plt.xticks(rotation=30) !!! very useful for words
-    plt.savefig(plot_destination)
-    #token_length_freq_dist_plot = token_length_distribution.plot(title=curve_title, percents=True)
-    #fig_freq_dist = token_length_freq_dist_plot.get_figure()
-    #fig_freq_dist.savefig(plot_destination)
+def pos_tag_freq(tokens):
+    #nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
+    #('completely', 'RB'), ('different', 'JJ')]
+    tag_token_tuples = pos_tag(tokens)
+    punctuation_regex = r"[^\w\s]+"
+    summarised_tags = []
+    punctuation_tags = []
+    index = 0
+    for token, tag in tag_token_tuples:
+        if re.match(punctuation_regex, token):
+            summarised_tags.append("punctuation")
+            if re.match(r"[\"\'“”’‘]+", token):
+                punctuation_tags.append("quotation_marks")
+            elif re.match(r"[,;:.?!-]+", token):
+                try:
+                    punctuation_tags.append(
+                        "ellipsis" if token == "." and tag_token_tuples[index+1][1] == "." and tag_token_tuples[index+2][1] == "."
+                        else "full_stop" if token == "."
+                        else "question_mark" if token == "?"
+                        else "exclamation_mark" if token == "!"
+                        else "comma" if token == ","
+                        else "semicolon" if token == ";"
+                        else "dash" if token == "-"
+                        else "other_punct")
+                except IndexError:  # lookahead ran past the last token: no ellipsis possible
+                    punctuation_tags.append(
+                        "full_stop" if token == "."
+                        else "question_mark" if token == "?"
+                        else "exclamation_mark" if token == "!"
+                        else "comma" if token == ","
+                        else "semicolon" if token == ";"
+                        else "dash" if token == "-"
+                        else "other_punct")
+        else:
+            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
+                summarised_tags.append("verb")
+            elif tag in ["JJ", "JJR", "JJS"]:
+                summarised_tags.append("adjective")
+            elif tag in ["RB", "RBR", "RBS", "WRB"]:
+                summarised_tags.append("adverb")
+            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
+                summarised_tags.append("pronoun")
+            elif tag in ["NNP", "NNPS"]:
+                summarised_tags.append("proper_noun")
+            elif tag in ["NN", "NNS"]:
+                summarised_tags.append("common_noun")
+            elif tag in ["DT", "PDT", "WDT"]:
+                summarised_tags.append("determiner")
+            elif tag == "CC":
+                summarised_tags.append("coordinating_conj")
+            elif tag == "IN":
+                summarised_tags.append("subordinating_conj")
+            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
+                summarised_tags.append("other_tag")
+        index += 1
+
+    tag_freq_dist = FreqDist(summarised_tags)
+
+    # convert FreqDist object to a pandas series for easier processing
+    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
+
+    # sort, normalise and round the panda series
+    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
+    for i in range(0, len(new_tag_freq_dist.index)):
+        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2)
+
+    # punctuation frequency distribution
+    punct_tag_freq_dist = FreqDist(punctuation_tags)
+
+    # convert FreqDist object to a pandas series for easier processing
+    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
+
+    # sort, normalise and round the panda series
+    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
+    for i in range(0, len(new_punct_tag_freq_dist.index)):
+        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3)
+
+    return new_tag_freq_dist, new_punct_tag_freq_dist
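A quick smoke test of the new helper (not part of the commit; assumes NLTK with the punkt and averaged_perceptron_tagger resources downloaded, and the hypothetical module name stylometry_metrics):

from nltk import word_tokenize
from stylometry_metrics import pos_tag_freq  # hypothetical module name

# ". . ." as three separate "." tokens exercises the ellipsis lookahead
tokens = word_tokenize('She ran quickly, and he followed . . . "Wait!" she said.')
tag_dist, punct_dist = pos_tag_freq(tokens)
print(tag_dist)    # pandas Series: verb/adverb/pronoun/... shares, rounded to 2 decimals
print(punct_dist)  # pandas Series: comma/ellipsis/exclamation_mark/quotation_marks shares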
#f"throne_of_glass/data/canon_works"
def extract_info_from_directory_path(directory_path):
#for txt_fic in os.listdir(directory_path):
works = os.listdir(directory_path)
pattern = r"^[a-zA-Z_]+(?=/)" # get series from directory path
match = re.search(pattern, directory_path)
if match:
series = match.group(0)
for work in works:
with open(f"{directory_path}"+f"/{work}", "r") as f:
f = f.read()
std_dev_tk, mean_tk, ttr = mendenhall_curve(f, f"Mendenhall Curve for the {series.replace('_' , ' ').title()} {work[:-4].replace('_' , ' ').title()}", f"{series}/freq_distribution/{work[:-4]}_token_len.png")
mean_tokens.append(mean_tk)
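The series-extraction regex only matches when the path begins with the series directory; a self-contained check of that behavior:

import re

# The lookahead (?=/) requires a slash immediately after the leading series segment.
pattern = r"^[a-zA-Z_]+(?=/)"
print(re.search(pattern, "throne_of_glass/data/canon_works").group(0))  # throne_of_glass
print(re.search(pattern, "grishaverse/data/canon_works").group(0))      # grishaverse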
 
 class StylometryMetrics:
 
-    def __init__(self, directory_path):
+    def __init__(self, directory_path, name_of_work, quality="", fanfiction=True):
         self.text = read_works_into_string(directory_path)
         self.clean_tokens = tokenize_and_clean_text(self.text)
+        self.name = name_of_work
+        self.fanfiction = fanfiction
+        self.quality = quality # good medium bad
 
     def calculate_standardised_ttr(self):
         self.sttr = standardised_type_token_ratio(self.clean_tokens)
@@ -179,84 +249,20 @@ class StylometryMetrics:
 
     def calculate_mendenhall_token_metrics(self):
         self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
 
-    def plot
-
-# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
-# precise input: corpus = string ;
-# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
-# plot_destination = string, the (relative) path, including the file name and .png tag of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
-def mendenhall_curve(corpus, curve_title, plot_destination):
-    short_clean_tokens = tokenize_and_clean_text(corpus)
-
-    # create the distribution of token lengths / Mendenhall curve
-    token_lengths = [len(token) for token in short_clean_tokens]
-
-    # Calculate the trimmed token length (with 5% trimming). We need to remove the outliers, bc even despite preprocessing,
-    # there still are some very wrong lengths, which entirely skews the metrics and also ruins our p-values later on
-    trim_percent = 0.005
-    trim_len = int(len(token_lengths) * trim_percent / 2)
-    token_lengths = sorted(token_lengths)[trim_len:-trim_len]
-
-    token_length_distribution = FreqDist(token_lengths).most_common(15)
-
-    # convert the FreqDist object to a pandas series for easier processing
-    token_len_dist_panda = pd.Series(dict(token_length_distribution))
-
-    # sort, normalise and round the panda series
-    new_token_len_dist = token_len_dist_panda.sort_index()
-    for i in range(0, len(new_token_len_dist.index)):
-        new_token_len_dist.iat[i] = round(new_token_len_dist.iat[i]/len(short_clean_tokens), 3) #index-1 bc the index starts counting from zero, the word lengths not
-
-    # plot using matplotlib and seaborn
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_token_len_dist.index, y=new_token_len_dist.values)
-
-    plt.title(curve_title)
-    ax.set_xlabel("Word Length")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_token_len_dist.index, y=new_token_len_dist.values, ax=ax, palette="flare")
-    #plt.xticks(rotation=30) !!! very useful for words
-    plt.savefig(plot_destination)
-
-    # calculate the standard deviation, mean, token/type ratio
-    standard_deviation = statistics.stdev(token_lengths)
-    mean = statistics.mean(token_lengths)
-    type_token_ratio = standardised_type_token_ratio(short_clean_tokens)
-
-    return standard_deviation, mean, type_token_ratio
+    def plot_token_metrics(self, file_path_for_pic):
+        plt_title = self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon") + " Token Frequency Distribution"
+        plot_distribution(x=self.tk_len_dist, y=self.tk_len_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Token Length", y_label="Percentage of Occurence")
+
+    def calculate_pos_tag_distribution(self):
+        self.tag_freq_dist, self.punct_tag_freq_dist = pos_tag_freq(self.clean_tokens)
+
+    def plot_pos_tag_freq(self, file_path_for_pic):
+        plt_title = "POS Tag Frequencies for the " + self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon")
+        plot_distribution(x=self.tag_freq_dist, y=self.tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="POS Tags", y_label="Percentage of Occurence")
+
+    def plot_punct_freq(self, file_path_for_pic):
+        plt_title = "Punctuation Frequencies for the " + self.name + " " + ((self.quality + " ") if self.fanfiction else "") + ("Fanfiction" if self.fanfiction else "Canon")
+        plot_distribution(x=self.punct_tag_freq_dist, y=self.punct_tag_freq_dist, plt_title=plt_title, file_path_for_pic=file_path_for_pic, x_label="Types of Punctuation", y_label="Percentage of Occurence")
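Taken together, the class now wires those pieces into a per-corpus workflow. A hedged end-to-end sketch (not part of the commit; the module name and the output file names are assumptions, while the input directory mirrors paths used elsewhere in this file):

from stylometry_metrics import StylometryMetrics  # hypothetical module name

m = StylometryMetrics("grishaverse/data/split_txt_fanfics", "Grishaverse", quality="good", fanfiction=True)
m.calculate_standardised_ttr()
m.calculate_mendenhall_token_metrics()
m.plot_token_metrics("grishaverse/freq_distribution/good_fanfic_token_len.png")   # illustrative path
m.calculate_pos_tag_distribution()
m.plot_pos_tag_freq("grishaverse/freq_distribution/good_fanfic_pos_tags.png")     # illustrative path
m.plot_punct_freq("grishaverse/freq_distribution/good_fanfic_punct.png")          # illustrative path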
 def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
@@ -351,167 +357,26 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
 
 # most frequent words for specific tags --> punctuation;
 # most frequent adjectives
 
-def pos_tag_frequencies(corpus, series, canon_or_fanfic):
-    #nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
-    #('completely', 'RB'), ('different', 'JJ')]
-    tokens = word_tokenize(corpus)
-    """
-    short_tokens = []
-    for token in tokens:
-        dehyphenated_token = []
-        letter_present = 0
-        dehyphenated = 0
-        second_word_in_compound = 0
-        for c in token:
-            if c.isalpha() == True:
-                dehyphenated_token.append(c)
-                letter_present = 1
-                if dehyphenated == 1:
-                    second_word_in_compound = 1
-            elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
-                #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
-                #high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
-                #relatively speaking
-                dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
-                short_tokens.append(dehyphenated_token_joined)
-                short_tokens.append(c) #append the hyphen/ other punctuation --> we're also interested in that
-                dehyphenated_token = []
-                letter_present = 0
-                dehyphenated = 1
-                second_word_in_compound = 0
-        if letter_present == 1 and dehyphenated == 0:
-            short_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
-        elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
-            short_tokens.append(''.join(map(str, dehyphenated_token)))
-    """
-    tag_token_tuples = pos_tag(tokens)
-    punctuation_regex = r"[^\w\s]+"
-    summarised_tags = []
-    punctuation_tags = []
-    index = 0
-    for token, tag in tag_token_tuples:
-        if re.match(punctuation_regex, token):
-            summarised_tags.append("punctuation")
-            if re.match(r"[\"\'“”’‘]+", token):
-                punctuation_tags.append("quotation_marks")
-            elif re.match(r"[,;:.?!-]+", token):
-                try:
-                    punctuation_tags.append("ellipsis" if token == "." and tag_token_tuples[index+1][1] == "." and tag_token_tuples[index+2][1] == "." else "full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
-                except:
-                    punctuation_tags.append("full_stop" if token == "." else "question_mark" if token == "?" else "exclamation_mark" if token == "!" else "comma" if token == "," else "semicolon" if token == ";" else "dash" if token == "-" else "other_punct")
-        else:
-            if tag in ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]:
-                summarised_tags.append("verb")
-            elif tag in ["JJ", "JJR", "JJS"]:
-                summarised_tags.append("adjective")
-            elif tag in ["RB", "RBR", "RBS", "WRB"]:
-                summarised_tags.append("adverb")
-            elif tag in ["PRP", "PRP$", "WP", "WP$"]:
-                summarised_tags.append("pronoun")
-            elif tag in ["NNP", "NNPS"]:
-                summarised_tags.append("proper_noun")
-            elif tag in ["NN", "NNS"]:
-                summarised_tags.append("common_noun")
-            elif tag in ["DT", "PDT", "WDT"]:
-                summarised_tags.append("determiner")
-            elif tag == "CC":
-                summarised_tags.append("coordinating_conj")
-            elif tag == "IN":
-                summarised_tags.append("subordinating_conj")
-            elif tag in ["$", "CD", "EX", "LS", "POS", "SYM", "TO", "UH", "RP", "FW"]:
-                summarised_tags.append("other_tag")
-        index += 1
-
-    tag_freq_dist = FreqDist(summarised_tags)
-
-    # convert FreqDist object to a pandas series for easier processing
-    tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
-
-    # sort, normalise and round the panda series
-    new_tag_freq_dist = tag_freq_dist_panda.sort_index()
-    for i in range(0, len(new_tag_freq_dist.index)):
-        new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2)
-    print(new_tag_freq_dist)
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values)
-
-    plt.title(f"POS Tag Frequencies for the {series.replace('_' , ' ').title()} {canon_or_fanfic.replace('_' , ' ').title()}")
-    ax.set_xlabel("POS Tags")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values, ax=ax, palette="RdPu")
-    plt.xticks(rotation=30) # !!! very useful for words
-    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_pos_tag_frequencies.png")
-
-    # punctuation frequency distribution
-    punct_tag_freq_dist = FreqDist(punctuation_tags)
-
-    # convert FreqDist object to a pandas series for easier processing
-    punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
-
-    # sort, normalise and round the panda series
-    new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
-    for i in range(0, len(new_punct_tag_freq_dist.index)):
-        new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 3)
-
-    # set figure, ax into variables
-    fig, ax = plt.subplots(figsize=(10,10))
-
-    # call function for bar (value) labels
-    addlabels(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values)
-
-    plt.title(f"Punctuation Frequencies for the {series.replace('_' , ' ').title()} {canon_or_fanfic.replace('_' , ' ').title()}")
-    ax.set_xlabel("Types of Punctuation")
-    ax.set_ylabel("Percentage of Occurence")
-
-    sns.barplot(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values, ax=ax, palette="OrRd")
-    plt.xticks(rotation=30) # !!! very useful for words
-    plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_punctuation_frequencies.png")
 
 #create the Mendenhall Curve for the Throne of Glass Series
-std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
+#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
 
 #create the Mendenhall Curve for the Grishaverse Books
-std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
+#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
 
 # Mendenhall Curve Sentence Lengths for Throne of Glass Canon
-std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", "throne_of_glass", "canon")
+#std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", "throne_of_glass", "canon")
 
 # Mendenhall Curve Sentence Lengths for Grishaverse Canon
-std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon")
+#std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon")
 
 # POS Tag frequencies for TOG
-pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
+#pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon")
 
 # POS Tag frequencies for Grishaverse
-pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
+#pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon")
 
 def run_functions(directory_path):
     """
@@ -574,6 +439,6 @@ data_overview = pd.DataFrame(
 
 if __name__ == "__main__":
-    run_functions("grishaverse/data/split_txt_fanfics")
-    run_functions("throne_of_glass/data/split_txt_fanfics")
-    data_overview.to_csv(f"data_overview/data_overview.csv")
+    #run_functions("grishaverse/data/split_txt_fanfics")
+    #run_functions("throne_of_glass/data/split_txt_fanfics")
+    #data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file