Commit 15403abf authored by chrysanthopoulou

Add mediocre punctuation workaround

parent 5d267e22
Changed images:
  • grishaverse/freq_distribution/canon_pos_tag_frequencies.png (52.7 KiB)
  • grishaverse/freq_distribution/canon_punctuation_frequencies.png (33.2 KiB)
  • grishaverse/freq_distribution/canon_sent_len_long.png (38.1 KiB → 37.1 KiB)
  • grishaverse/freq_distribution/canon_sent_len_short.png (34.8 KiB → 33.8 KiB)
@@ -5,6 +5,7 @@ import os
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag
import pandas as pd
import statistics
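# Note: word_tokenize, sent_tokenize and pos_tag assume the corresponding NLTK
# data packages are installed locally; if they are missing, a one-time download
# along these lines is typically needed (package names may vary by NLTK version):
#
#   import nltk
#   nltk.download("punkt")                       # tokenizer models
#   nltk.download("averaged_perceptron_tagger")  # default English POS tagger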
@@ -77,14 +78,9 @@ def standardised_type_token_ratio(tokens):
return std_ttr
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
# precise input: corpus = string ;
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
def tokenize_and_clean_text(text):
def mendenhall_curve(corpus, curve_title, plot_destination):
tokens = word_tokenize(corpus)
tokens = word_tokenize(text)
cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin"
@@ -116,8 +112,20 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
short_clean_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
return short_clean_tokens
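# The per-character loop in tokenize_and_clean_text() boils down to keeping only
# the alphabetic runs of each token, so that "everywhere—assassin" counts as two
# tokens rather than one 19-character token. A minimal regex sketch of the same
# idea (hypothetical helper; edge cases may differ slightly from the loop above):
import re

def split_into_alphabetic_runs(tokens):
    # e.g. ["everywhere—assassin", "red-blue"] -> ["everywhere", "assassin", "red", "blue"]
    cleaned = []
    for token in tokens:
        cleaned.extend(re.findall(r"[^\W\d_]+", token))
    return cleaned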
# this function takes a corpus as its input and gives a Mendenhall curve, i.e. a frequency distribution of tokens as its output
# precise input: corpus = string ;
# curve_title = string, the title of the plot that will be produced, e.g., "Mendenhall Curve for Throne of Glass Series"
# plot_destination = string, the (relative) path, including the file name and .png extension, of the plot produced, e.g. f"throne_of_glass/freq_distribution/all_canon_token_len.png"
def mendenhall_curve(corpus, curve_title, plot_destination):
short_clean_tokens = tokenize_and_clean_text(corpus)
# create the distribution of token lengths / Mendenhall curve
token_lengths = [len(token) for token in short_clean_tokens]
@@ -169,46 +177,11 @@ def mendenhall_curve(corpus, curve_title, plot_destination):
def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
sents = sent_tokenize(corpus)
sent_lens = []
for sent in sents:
#print(sent)
tokens = word_tokenize(sent)
#print(tokens)
cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
#print(cleaned_tokens)
short_clean_tokens = [] # when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I had the tokens with very large lengths printed and they had this format, e.g. "everywhere—assassin"
# and were counted as a single token, in this instance 19 characters long, with others up to 45 characters long: "walking-as-fast-as-they-could-without-running"
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
dehyphenated = 0
second_word_in_compound = 0
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
if dehyphenated == 1:
second_word_in_compound = 1
elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
# high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
# relatively speaking
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
dehyphenated = 1
second_word_in_compound = 0
if letter_present == 1 and dehyphenated == 0:
short_clean_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
short_clean_tokens.append(''.join(map(str, dehyphenated_token)))
#print(short_clean_tokens)
#print(len(short_clean_tokens))
short_clean_tokens = tokenize_and_clean_text(sent)
sent_lens.append(len(short_clean_tokens))
#if len(short_clean_tokens)>= 90:
#print(f"This sentence: \n {sent} \n is this long: {len(short_clean_tokens)}")
@@ -261,7 +234,6 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
for i in range(0, len(new_sent_len_dist_short.index)):
#for index in new_token_len_dist.index:
new_sent_len_dist_short.iat[i] = round(new_sent_len_dist_short.iat[i]/len(sent_lens), 2) # normalise each sentence-length count to a proportion of all sentences and round to two decimals
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
@@ -284,20 +256,209 @@ def sentence_metrics(corpus, curve_title, series, canon_or_fanfic):
return standard_deviation_sent, mean_sent
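# sentence_metrics() above and pos_tag_frequencies() below both repeat the same
# "FreqDist -> pandas Series -> sort -> normalise -> round" steps. A possible
# shared helper, sketched here under a hypothetical name:
def normalised_freq_series(observations):
    # turn a list of observations into proportions, sorted by index and
    # rounded to two decimals
    freq_dist = FreqDist(observations)
    series = pd.Series(dict(freq_dist)).sort_index()
    return (series / len(observations)).round(2)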
def most_frequent_words(corpus, curve_title, series, canon_or_fanfic):
pass # placeholder so the stub parses; not implemented yet
# overall pos_tag frequency distribution
# pos_tag ngrams; (maybe exclude stopwords?)
# tag collocates for specific tags --> adjectives most frequently with nouns
# most frequent words
# most frequent words for specific tags --> punctuation;
# most frequent adjectives
def pos_tag_frequencies(corpus, curve_title, series, canon_or_fanfic):
#nltk.pos_tag(text) --> [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
#('completely', 'RB'), ('different', 'JJ')]
tokens = word_tokenize(corpus)
short_tokens = []
for token in tokens:
dehyphenated_token = []
letter_present = 0
dehyphenated = 0
second_word_in_compound = 0
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
if dehyphenated == 1:
second_word_in_compound = 1
elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
# high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
# relatively speaking
dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_tokens.append(dehyphenated_token_joined)
short_tokens.append(c) #append the hyphen/ other punctuation --> we're also interested in that
dehyphenated_token = []
letter_present = 0
dehyphenated = 1
second_word_in_compound = 0
if letter_present == 1 and dehyphenated == 0:
short_tokens.append(token) #catching the tokens that didn't have any special characters; but not the dehyphenated ones twice
elif letter_present == 1 and dehyphenated == 1 and second_word_in_compound == 1:
short_tokens.append(''.join(map(str, dehyphenated_token)))
tag_token_tuples = pos_tag(short_tokens)
"""
#coordinating_conjunction = [] # tag: CC
#subordinating_conjunction = [] # tag: IN
#determiner = [] # tag: DT; PDT; WDT
#noun = [] # tags: NN, NNS
#proper_noun = [] # tags: NNP; NNPS
#pronoun = [] # tags: PRP, PRP$; WP; WP$
#adverb = [] # RB; RBR; RBS; WRB
#verb = [] # tags: MD; VB; VBD; VBG; VBN; VBP; VBZ
#adjective = [] #tags: JJ; JJR; JJS
other_tags = [] # tags: $; CD; EX; LS; POS; SYM; TO; UH; RP; FW
#punctuation
quotation_marks = [] # tags: '', ``
comma = [] # tags: ,
dash = [] # tags: --
sentence_terminator = [] # tag: .
parentheses = [] # tags: (; )
semicolon = [] # for tag : --> token: ;
ellipsis_punct = [] # for tag : --> token: ...
"""
punctuation_tags = []
summarised_tags = []
for tuple in tag_token_tuples:
if tuple[1] in ["MD" , "VB" , "VBD", "VBG" , "VBN" , "VBP" , "VBZ"]:
summarised_tags.append("verb")
elif tuple[1] in ["JJ" , "JJR" , "JJS"]:
summarised_tags.append("adjective")
elif tuple[1] in ["RB" , "RBR" , "RBS" , "WRB"]:
summarised_tags.append("adverb")
elif tuple[1] in ["PRP" , "PRP$" , "WP" , "WP$"]:
summarised_tags.append("pronoun")
elif tuple[1] in ["NNP" , "NNPS"]:
summarised_tags.append("proper_noun")
elif tuple[1] in ["NN" , "NNS"]:
summarised_tags.append("common_noun")
elif tuple[1] in [ "DT" , "PDT" , "WDT"]:
summarised_tags.append("determiner")
elif tuple[1] == "CC":
summarised_tags.append("coordinating_conj")
elif tuple[1] == "IN":
summarised_tags.append("subordinating_conj")
elif tuple[1] in ["$" , "CD" , "EX" , "LS" , "POS" , "SYM" , "TO" , "UH" , "RP" , "FW"]:
summarised_tags.append("other_tag")
# now comes the punctuation
elif tuple[1] in [ "''" , "``"]:
summarised_tags.append("punctuation")
punctuation_tags.append("quotation_marks")
elif tuple[1] == ",":
summarised_tags.append("punctuation")
punctuation_tags.append("comma")
elif tuple[1] == ".":
summarised_tags.append("punctuation")
punctuation_tags.append("sentence_terminator")
elif tuple[1] in ["(" , ")"]:
summarised_tags.append("punctuation")
punctuation_tags.append("parentheses")
elif tuple[1] == "--":
summarised_tags.append("punctuation")
punctuation_tags.append("dash")
elif tuple[1] == ":":
summarised_tags.append("punctuation")
if tuple[0] == ";":
punctuation_tags.append("semicolon")
elif tuple[0] == "...":
punctuation_tags.append("ellipsis")
elif tuple[0] == ":":
punctuation_tags.append("colon")
else:
summarised_tags.append(tuple[1])
tag_freq_dist = FreqDist(summarised_tags)
#print(tag_freq_dist)
# convert FreqDist object to a pandas series for easier processing
tag_freq_dist_panda = pd.Series(dict(tag_freq_dist))
print(tag_freq_dist_panda)
# sort, normalise and round the panda series
new_tag_freq_dist = tag_freq_dist_panda.sort_index()
#print(new_sent_len_dist)
for i in range(0, len(new_tag_freq_dist.index)):
#for index in new_token_len_dist.index:
new_tag_freq_dist.iat[i] = round(new_tag_freq_dist.iat[i]/len(tag_token_tuples), 2) # normalise each tag count to a proportion of all tagged tokens and round to two decimals
print(new_tag_freq_dist)
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
addlabels(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values)
plt.title(curve_title)
ax.set_xlabel("POS Tags")
ax.set_ylabel("Percentage of Occurence")
sns.barplot(x=new_tag_freq_dist.index, y=new_tag_freq_dist.values, ax=ax, palette="flare")
plt.xticks(rotation=30) # !!! very useful for words
plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_pos_tag_frequencies.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
#punctuation frequency distribution
punct_tag_freq_dist = FreqDist(punctuation_tags)
#print(tag_freq_dist)
# convert FreqDist object to a pandas series for easier processing
punct_tag_freq_dist_panda = pd.Series(dict(punct_tag_freq_dist))
print(punct_tag_freq_dist_panda)
# sort, normalise and round the panda series
new_punct_tag_freq_dist = punct_tag_freq_dist_panda.sort_index()
#print(new_sent_len_dist)
for i in range(0, len(new_punct_tag_freq_dist.index)):
#for index in new_token_len_dist.index:
new_punct_tag_freq_dist.iat[i] = round(new_punct_tag_freq_dist.iat[i]/len(punctuation_tags), 2) # normalise each count to a proportion of all punctuation tokens and round to two decimals
print(new_punct_tag_freq_dist)
# set figure, ax into variables
fig, ax = plt.subplots(figsize=(10,10))
# call function for bar (value) labels
addlabels(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values)
plt.title(curve_title)
ax.set_xlabel("Types of Punctuation")
ax.set_ylabel("Percentage of Occurence")
sns.barplot(x=new_punct_tag_freq_dist.index, y=new_punct_tag_freq_dist.values, ax=ax, palette="flare")
plt.xticks(rotation=30) # !!! very useful for words
plt.savefig(f"{series}/freq_distribution/{canon_or_fanfic}_punctuation_frequencies.png") # "throne_of_glass/freq_distribution/all_canon_sent_len.png"
#create the Mendenhall Curve for the Throne of Glass Series
std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png")
#create the Mendenhall Curve for the Grishaverse Books
std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png")
# Mendenhall Curve Sentence Lengths for Throne of Glass Canon
std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", "throne_of_glass", "canon")
#std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Throne of Glass Series", "throne_of_glass", "canon")
# Mendenhall Curve Sentence Lengths for Grishaverse Canon
std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", "grishaverse", "canon")
#std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lengths for the Grishaverse Books", "grishaverse", "canon")
# POS Tag frequencies for TOG
#pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "POS Tag Frequencies for the Throne of Glass Series", "throne_of_glass", "canon")
# POS Tag frequencies for Grishaverse
pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "POS Tag Frequencies for the Grishaverse Books", "grishaverse", "canon")
# create a dataframe to store all the overview statistics in
@@ -307,7 +468,7 @@ std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(r
# tag_ngram_frequencies
# punctuation frequencies
# token/type ratio
"""
data_overview = pd.DataFrame(
{"mean_tokens":[mean_tokens_tog_canon, mean_tokens_grishaverse_canon],
"std_dev":[std_dev_tokens_tog_canon, std_dev_tokens_grishaverse_canon],
@@ -317,4 +478,6 @@ data_overview = pd.DataFrame(
index= ["throne_of_glass_canon", "grishaverse_canon"]
)
data_overview.to_csv(f"data_overview/data_overview.csv")
\ No newline at end of file
data_overview.to_csv(f"data_overview/data_overview.csv")
"""
\ No newline at end of file
Changed images:
  • throne_of_glass/freq_distribution/canon_pos_tag_frequencies.png (67.9 KiB)
  • throne_of_glass/freq_distribution/canon_punctuation_frequencies.png (33.8 KiB)