Commit 8f71a165
authored 1 year ago by chrysanthopoulou
Remove the hyphens to unskew the Mendenhall curve
Parent: 1617381c
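The motivation is spelled out in the code comments in the diff below: when a stray hyphen or em dash glues two words together, the joined token is counted as one very long word, which inflates the upper tail of the token-length distribution (the Mendenhall curve). A quick check in any Python shell shows the effect for the example quoted in those comments:

len("everywhere—assassin")            # 19: counted as a single, unusually long "word"
len("everywhere"), len("assassin")    # (10, 8): the lengths the curve should actually reflect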
Showing 2 changed files with 59 additions and 1 deletion:
stylometry_code.py  (+59, −1)
throne_of_glass/freq_distribution/all_canon_token_len.png  (+0, −0)
stylometry_code.py  +59 −1
@@ -43,16 +43,74 @@ def read_works_into_string(directory_path):
tokens = word_tokenize(read_works_into_string("throne_of_glass/data/canon_works"))
cleaned_tokens = ([token for token in tokens if any(c.isalpha() for c in token)])
short_clean_tokens = []
# when looking at the results, there were some strange token lengths, because somewhere in the data conversion hyphens
# had been added in the wrong places. I printed the tokens with very large lengths and they had this format, e.g. "everywhere—assassin",
# which was counted as 19 characters long; some were up to 45 characters long: "walking-as-fast-as-they-could-without-running"
"""
for token in cleaned_tokens:
dehyphenated_token = []
letter_present = 0
if len(token) >= 19:
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
#print(dehyphenated_token)
elif c.isalpha() == False and (c ==
"
-
"
or c ==
"
—
"
) and letter_present == 1: #here I am eliminating both dashes and hyphens,
#bc the hyphens are used both correctly and incorrectly and it skews my distribution a lot
#print(dehyphenated_token)
dehyphenated_token_joined =
''
.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
elif len(token) >= 14:
for c in token:
if c.isalpha() == True:
dehyphenated_token.append(c)
letter_present = 1
#print(dehyphenated_token)
elif c ==
"
—
"
and letter_present == 1: #here I am eliminating only dashes
"
territory—thanks
"
but keeping hyphenated
# words as one
"
cobbled-together
"
#print(dehyphenated_token)
dehyphenated_token_joined =
''
.join(map(str, dehyphenated_token))
#print(dehyphenated_token_joined)
short_clean_tokens.append(dehyphenated_token_joined)
dehyphenated_token = []
letter_present = 0
else:
short_clean_tokens.append(token)
"""
for token in cleaned_tokens:
    dehyphenated_token = []
    letter_present = 0
    for c in token:
        if c.isalpha() == True:
            dehyphenated_token.append(c)
            letter_present = 1
        elif c.isalpha() == False and letter_present == 1: #here I am eliminating both dashes and hyphens,
            #bc it skews the word metric if red-blue is counted as a 9 character token, boosting the count of
            # high-character tokens significantly. all texts will be preprocessed the same way, so it shouldn't make a difference,
            # relatively speaking
            dehyphenated_token_joined = ''.join(map(str, dehyphenated_token))
            #print(dehyphenated_token_joined)
            short_clean_tokens.append(dehyphenated_token_joined)
            dehyphenated_token = []
            letter_present = 0
# distribution of token lengths / Mendenhall curve
-token_lengths = [len(token) for token in cleaned_tokens]
+token_lengths = [len(token) for token in short_clean_tokens]
token_length_distribution = FreqDist(token_lengths)
print(token_length_distribution.tabulate())
token_length_freq_dist_plot = token_length_distribution.plot(title="Token Length Frequency Distribution: Throne of Glass Series", percents=True)
fig_freq_dist = token_length_freq_dist_plot.get_figure()
fig_freq_dist.savefig("throne_of_glass/freq_distribution/all_canon_token_len.png")
for token in short_clean_tokens:
    if len(token) >= 14:
        print(f"this is the word: {token} and it's this long {len(token)}")
#print(read_works_into_string("throne_of_glass/data/canon_works"))
# transform corpus into a list of tokens
\ No newline at end of file
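One observation on the new character-level loop in this hunk: as reconstructed here, it appends a segment to short_clean_tokens only when a non-alphabetic character follows it, so the piece after the last separator, and any token consisting entirely of letters, is never flushed. The sketch below is an editorial illustration rather than part of the commit; it reuses the cleaned_tokens list from the hunk above, relies only on the standard library, and keeps every alphabetic run while splitting on exactly the characters that c.isalpha() rejects.

from itertools import groupby

# Editorial sketch, not the committed code: keep every run of alphabetic
# characters in each token, so "everywhere—assassin" contributes both
# "everywhere" and "assassin", and an unhyphenated word passes through intact.
short_clean_tokens = []
for token in cleaned_tokens:
    for is_alpha, run in groupby(token, key=str.isalpha):
        if is_alpha:
            short_clean_tokens.append("".join(run))

Downstream, the Mendenhall curve is computed exactly as in the hunk above; the only difference is which word pieces end up in short_clean_tokens.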
throne_of_glass/freq_distribution/all_canon_token_len.png  +0 −0
Replaced binary image: 35.3 KiB (previous version, 1617381c) → 33 KiB (this commit, 8f71a165)