diff --git a/__pycache__/stylometry_code.cpython-37.pyc b/__pycache__/stylometry_code.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..166f45afb5b6a66943764ec88f18c66b57e5ba22 Binary files /dev/null and b/__pycache__/stylometry_code.cpython-37.pyc differ diff --git a/__pycache__/type_token_demo.cpython-37.pyc b/__pycache__/type_token_demo.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11c0dfbac40a4e3f8722cc0d4b9e542bcf7786d9 Binary files /dev/null and b/__pycache__/type_token_demo.cpython-37.pyc differ diff --git a/demo_text.txt b/demo_text.txt new file mode 100644 index 0000000000000000000000000000000000000000..045cc5e09112de33c89b1fab7a45c29c255a738a --- /dev/null +++ b/demo_text.txt @@ -0,0 +1,53 @@ +MR. BENNET was among the earliest of those who waited on Mr. Bingley. He had always intended to visit him, though to the last always assuring his wife that he should not go; and till the evening after the visit was paid she had no knowledge of it. It was then disclosed in the following manner. Observing his second daughter employed in trimming a hat, he suddenly addressed her with,— + +“I hope Mr. Bingley will like it, Lizzy.” + +“We are not in a way to know what Mr. Bingley likes,” said her mother, resentfully, “since we are not to visit.{7}” + +“But you forget, mamma,” said Elizabeth, “that we shall meet him at the assemblies, and that Mrs. Long has promised to introduce him.” + +“I do not believe Mrs. Long will do any such thing. She has two nieces of her own. She is a selfish, hypocritical woman, and I have no opinion of her.” + +“No more have I,” said Mr. Bennet; “and I am glad to find that you do not depend on her serving you.” + +Mrs. Bennet deigned not to make any reply; but, unable to contain herself, began scolding one of her daughters. + +“Don’t keep coughing so, Kitty, for heaven’s sake! Have a little compassion on my nerves. 
You tear them to pieces.” + +“Kitty has no discretion in her coughs,” said her father; “she times them ill.” + +“I do not cough for my own amusement,” replied Kitty, fretfully. “When is your next ball to be, Lizzy?” + +“To-morrow fortnight.” + +“Ay, so it is,” cried her mother, “and Mrs. Long does not come back till the day before; so, it will be impossible for her to introduce him, for she will not know him herself.” + +“Then, my dear, you may have the advantage of your friend, and introduce Mr. Bingley to her.” + +“Impossible, Mr. Bennet, impossible, when I am not acquainted with him myself; how can you be so teasing?” + +“I honour your circumspection. A fortnight’s acquaintance is certainly very little. One cannot know what a man really is by the end of a fortnight. But if we do not venture, somebody else will; and after all, Mrs. Long and her nieces must stand their chance; and, therefore,{8} as she will think it an act of kindness, if you decline the office, I will take it on myself.” + +The girls stared at their father. Mrs. Bennet said only, “Nonsense, nonsense!” + +“What can be the meaning of that emphatic exclamation?” cried he. “Do you consider the forms of introduction, and the stress that is laid on them, as nonsense? I cannot quite agree with you there. What say you, Mary? For you are a young lady of deep reflection, I know, and read great books, and make extracts.” + +Mary wished to say something very sensible, but knew not how. + +“While Mary is adjusting her ideas,” he continued, “let us return to Mr. Bingley.” + +“I am sick of Mr. Bingley,” cried his wife. + +“I am sorry to hear that; but why did you not tell me so before? If I had known as much this morning, I certainly would not have called on him. It is very unlucky; but as I have actually paid the visit, we cannot escape the acquaintance now.” + +The astonishment of the ladies was just what he wished—that of Mrs. 
Bennet perhaps surpassing the rest; though when the first tumult of joy was over, she began to declare that it was what she had expected all the while. + +“How good it was in you, my dear Mr. Bennet! But I knew I should persuade you at last. I was sure you loved your girls too well to neglect such an acquaintance. Well, how pleased I am! And it is such a good joke, too, that you should have gone this morning, and never said a word about it till now.” + +“Now, Kitty, you may cough as much as you choose,” said Mr. Bennet; and, as he spoke, he left the room, fatigued with the raptures of his wife.{9} + +“What an excellent father you have, girls,” said she, when the door was shut. “I do not know how you will ever make him amends for his kindness; or me either, for that matter. At our time of life, it is not so pleasant, I can tell you, to be making new acquaintances every day; but for your sakes we would do anything. Lydia, my love, though you are the youngest, I dare say Mr. Bingley will dance with you at the next ball.” + +“Oh,” said Lydia, stoutly, “I am not afraid; for though I am the youngest, I’m the tallest.” + +The rest of the evening was spent in conjecturing how soon he would return Mr. Bennet’s visit, and determining when they should ask him to dinner. 
diff --git a/stylometry_code.py b/stylometry_code.py index 6afec30a9aa461ffe3511b8b73a80a92d367a29c..47b8769bdd46d9bade4587eda67e5d76e168672d 100644 --- a/stylometry_code.py +++ b/stylometry_code.py @@ -422,23 +422,23 @@ def pos_tag_frequencies(corpus, series, canon_or_fanfic): #create the Mendenhall Curve for the Throne of Glass Series -std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png") +#std_dev_tokens_tog_canon, mean_tokens_tog_canon, type_token_ratio_tog_canon = mendenhall_curve(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for the Throne of Glass Series", f"throne_of_glass/freq_distribution/all_canon_token_len.png") #create the Mendenhall Curve for the Grishaverse Books -std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png") +#std_dev_tokens_grishaverse_canon, mean_tokens_grishaverse_canon, type_token_ratio_grishaverse_canon = mendenhall_curve(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for the Grishaverse Books", f"grishaverse/freq_distribution/all_canon_token_len.png") # Mendenhall Curve Sentence Lengths for Throne of Glass Canon -std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", "throne_of_glass", "canon") +#std_dev_sent_tog_canon, mean_sent_tog_canon = sentence_metrics(read_works_into_string(f"throne_of_glass/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Throne of Glass Series", 
"throne_of_glass", "canon") # Mendenhall Curve Sentence Lenghts for Grishavers Canon -std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon") +#std_dev_sent_grishaverse_canon, mean_sent_grishaverse_canon = sentence_metrics(read_works_into_string(f"grishaverse/data/canon_works"), "Mendenhall Curve for Sentence Lenghts for the Grishaverse Books", "grishaverse", "canon") # POS Tag frequencies for TOG -pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon") +#pos_tag_frequencies(read_works_into_string(f"throne_of_glass/data/canon_works"), "throne_of_glass", "canon") # POS Tag frequencies for Grishaverse -pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon") +#pos_tag_frequencies(read_works_into_string(f"grishaverse/data/canon_works"), "grishaverse", "canon") def run_functions(directory_path): """ @@ -483,8 +483,8 @@ std_dev_sents = [std_dev_sent_tog_canon, std_dev_sent_grishaverse_canon] index = ["throne_of_glass_canon", "grishaverse_canon"] -run_functions("grishaverse/data/split_txt_fanfics") -run_functions("throne_of_glass/data/split_txt_fanfics") +#run_functions("grishaverse/data/split_txt_fanfics") +#run_functions("throne_of_glass/data/split_txt_fanfics") # create a dataframe to store all the overview statistics in # columns mean_tokens; std_dev_tokens; freq_token_len_1; ...; freq_token_len_15; diff --git a/type_token_demo.py b/type_token_demo.py new file mode 100644 index 0000000000000000000000000000000000000000..56ae41c3b5075e6b6f92e5a54da9c1aa96dd1b8a --- /dev/null +++ b/type_token_demo.py @@ -0,0 +1,52 @@ +from nltk.tokenize import word_tokenize +from nltk.probability import FreqDist +from nltk.tokenize import sent_tokenize +from nltk.tag import pos_tag +import pandas as pd +import statistics 
import re
from nltk.stem import WordNetLemmatizer
import nltk

# One shared lemmatizer for the whole module; the lexeme analysis reuses it
# instead of constructing a second instance on every call.
lemmatizer = WordNetLemmatizer()


def get_types_tokens_for_lexemes(text):
    """Print a type/token report for *text* at the lexeme (lemma) level.

    The text is split with ``word_tokenize`` and every token is passed
    through the module-level ``lemmatizer`` with its default settings.
    Tokens are the lemmatized words in order; types are the distinct lemmas.

    Args:
        text: The raw text to analyse.

    Returns:
        None. The counts, the set of types and the list of tokens are
        printed to standard output.
    """
    lemmatized_tokens = [
        lemmatizer.lemmatize(token) for token in word_tokenize(text)
    ]
    types = set(lemmatized_tokens)
    print(f"For a type token analysis on a lexeme level, this is the number of types: {len(types)} \n and this is the number of tokens: {len(lemmatized_tokens)} \n")
    print(f"These are the types: \n {types} \n")
    print(f"and these are the tokens: \n {lemmatized_tokens} \n")


def get_types_tokens_for_word_forms(text):
    """Print a type/token report for *text* at the word-form level.

    Tokens are the raw output of ``word_tokenize`` (no normalisation);
    types are the distinct token strings.

    Args:
        text: The raw text to analyse.

    Returns:
        None. The counts, the set of types and the list of tokens are
        printed to standard output.
    """
    tokens = word_tokenize(text)
    types = set(tokens)
    print(f"For a type token analysis on a word form level, this is the number of types: {len(types)} \n and this is the number of tokens: {len(tokens)} \n")
    print(f"These are the types: \n {types} \n")
    print(f"and these are the tokens: \n {tokens}")


def get_types_tokens_for_part_of_speech(text):
    """Print a type/token report for *text* at the part-of-speech level.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Tokens are the tags in text order; types are the distinct
    tags that occur.

    Args:
        text: The raw text to analyse.

    Returns:
        None. The counts, the set of tag types and the list of tag tokens
        are printed to standard output.
    """
    # pos_tag yields (word, tag) pairs; only the tag is kept.
    tags_as_tokens = [tag for _word, tag in pos_tag(word_tokenize(text))]
    tags_as_types = set(tags_as_tokens)
    print(f"For a type token analysis on a part of speech level, this is the number of types: {len(tags_as_types)} \n and this is the number of tokens: {len(tags_as_tokens)} \n")
    print(f"These are the types: \n {tags_as_types} \n")
    print(f"and these are the tokens: \n {tags_as_tokens} \n")


# The demo text contains typographic quotes and dashes, so the encoding is
# given explicitly: relying on the locale default turns those characters
# into mojibake (or raises UnicodeDecodeError) on non-UTF-8 systems.
# NOTE: this still runs at import time, exactly as before.
with open("demo_text.txt", encoding="utf-8") as demo_file:
    demo_text = demo_file.read()

get_types_tokens_for_lexemes(demo_text)
get_types_tokens_for_part_of_speech(demo_text)
get_types_tokens_for_word_forms(demo_text)