chrysanthopoulou / fanfiction and stylometry clean / Commits

Commit 8a31a9ff, authored 1 year ago by chrysanthopoulou
Modify the separation some more
parent 428c3c1f
Showing 4 changed files
clean_stylometry.py               +1  −1
fanfic_preprocessing.py           +33 −19
singular_fanfics_stylometry.py    +21 −18
stylo_sing.py                     +5  −2
with 60 additions and 40 deletions
clean_stylometry.py  (+1 −1)
@@ -104,7 +104,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)): #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         #    new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
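The one-line change wraps the stored count in float() before dividing, likely because new_dist holds integer counts: assigning a rounded float back through .iat into an integer-dtype Series can truncate every relative frequency to 0. A minimal sketch of the same normalization with the dtype fixed up front; it assumes the counts come from something like nltk's FreqDist, and the helper name is illustrative rather than the repository's exact code:

import pandas as pd
from nltk import FreqDist

def freq_dist_as_relative_series(list_of_items, most_common_limit=None):
    # raw counts, e.g. of token lengths or sentence lengths
    counts = FreqDist(list_of_items)
    items = counts.most_common(most_common_limit) if most_common_limit else counts.items()
    # float dtype up front, so no cast is needed when writing the ratios back
    dist = pd.Series(dict(items), dtype="float64")
    # divide each count by the total number of items to get a relative frequency
    return (dist / len(list_of_items)).round(3)

# usage sketch
# freq_dist_as_relative_series([3, 5, 5, 2, 7, 5, 3])  # relative frequency per token length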
fanfic_preprocessing.py  (+33 −19)
@@ -90,13 +90,13 @@ def separate_fanfics_by_good_medium_bad(df, series, few_kudos_thres, medium_kudo
     if os.path.exists(f"{series}/data/split_txt_fanfics/medium_fics") == False:
         os.makedirs(f"{series}/data/split_txt_fanfics/medium_fics")
-    with open(f"{series}/data/split_txt_fanfics/good_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/good_fics/good_fics.txt", "w") as f:
         f.write(good_fics_joined)
-    with open(f"{series}/data/split_txt_fanfics/bad_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/bad_fics/bad_fics.txt", "w") as f:
         f.write(bad_fics_joined)
-    with open(f"{series}/data/split_txt_fanfics/medium_fics.txt", "w") as f:
+    with open(f"{series}/data/split_txt_fanfics/medium_fics/medium_fics.txt", "w") as f:
         f.write(medium_fics_joined)

 def clean_fanfic_dataset(file_path):
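This hunk moves each tier's joined text from a flat file (good_fics.txt next to the others) into the per-tier subdirectory that the makedirs calls above create (good_fics/good_fics.txt, and likewise for bad and medium). A minimal sketch of the same write-out step with the repeated path logic folded into a loop; the helper name and exact directory layout are assumptions for illustration, not the repository's function:

import os

def write_tier_files(series, tiers):
    # tiers maps a tier name ("good_fics", "medium_fics", "bad_fics")
    # to the joined text of all fanfics in that tier
    for tier, joined_text in tiers.items():
        tier_dir = f"{series}/data/split_txt_fanfics/{tier}"
        os.makedirs(tier_dir, exist_ok=True)  # replaces the exists()/makedirs() pair
        with open(f"{tier_dir}/{tier}.txt", "w") as f:
            f.write(joined_text)

# usage sketch
# write_tier_files("cosmere", {"good_fics": good_fics_joined,
#                              "medium_fics": medium_fics_joined,
#                              "bad_fics": bad_fics_joined})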
@@ -156,6 +156,7 @@ def run_functions(file_paths):
 if __name__ == "__main__":
+    """
     #clean_fanfic_dataset("cosmere/data/fanfics/cosmere_fanfics.csv")
@@ -186,27 +187,40 @@ if __name__ == "__main__":
     """
     call_me_by_your_name = pd.read_csv("call_me_by_your_name/data/fanfics/call_me_by_your_name_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(call_me_by_your_name, "call_me_by_your_name", 69, 201) #I select the thresholds to have a 50% 30% 20% split
     # numbers underneath for 50% 35% 15% 69 276
     cosmere = pd.read_csv("cosmere/data/fanfics/cosmere_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(cosmere, "cosmere", 36, 73) # I select the thresholds to have a 50% 25% 25% split
+    separate_fanfics_by_good_medium_bad(cosmere, "cosmere", 40, 88) # 40 108
     divergent = pd.read_csv("divergent/data/fanfics/divergent_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(divergent, "divergent", 32, 70)
+    separate_fanfics_by_good_medium_bad(divergent, "divergent", 33, 94) # 33 119
     grishaverse = pd.read_csv("grishaverse/data/fanfics/grisha_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(grishaverse, "grishaverse", 131, 284)
+    separate_fanfics_by_good_medium_bad(grishaverse, "grishaverse", 134, 346) # 134 440
     maze_runner = pd.read_csv("maze_runner/data/fanfics/mazerunner_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(maze_runner, "maze_runner", 84, 188)
+    separate_fanfics_by_good_medium_bad(maze_runner, "maze_runner", 89, 239) # 89 299
     murderbot = pd.read_csv("murderbot/data/fanfics/murderbot_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(murderbot, "murderbot", 63, 143)
+    separate_fanfics_by_good_medium_bad(murderbot, "murderbot", 95, 209) # 95 239
     percy = pd.read_csv("percy/data/fanfics/percy_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(percy, "percy", 94, 233)
+    separate_fanfics_by_good_medium_bad(percy, "percy", 99, 291) # 99 343
     red_white_royal_blue = pd.read_csv("red_white_royal_blue/data/fanfics/red_white_royal_blue_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(red_white_royal_blue, "red_white_royal_blue", 280, 666) # 280 884
     school_for_good_and_evil = pd.read_csv("school_for_good_and_evil/data/fanfics/school_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(school_for_good_and_evil, "school_for_good_and_evil", 63, 143)
+    separate_fanfics_by_good_medium_bad(school_for_good_and_evil, "school_for_good_and_evil", 63, 169) # 63 198
     simonverse = pd.read_csv("simonverse/data/fanfics/simonverse_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(simonverse, "simonverse", 181, 347) # 181 376
     song_of_achilles = pd.read_csv("song_of_achilles/data/fanfics/song_of_achilles_fanfics_new.csv")
     separate_fanfics_by_good_medium_bad(song_of_achilles, "song_of_achilles", 122, 285) # 122 329
     throne_of_glass = pd.read_csv("throne_of_glass/data/fanfics/throne_of_glass_fanfics_new.csv")
-    separate_fanfics_by_good_medium_bad(throne_of_glass, "throne_of_glass", 56, 109)
-    """
\ No newline at end of file
+    separate_fanfics_by_good_medium_bad(throne_of_glass, "throne_of_glass", 56, 131) # 56 153
+    #"""
\ No newline at end of file
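The inline comments ("#I select the thresholds to have a 50% 30% 20% split", "# 40 108", "# 33 119", ...) indicate that the two kudos thresholds per fandom are hand-tuned so the bad/medium/good buckets land near a target share of that fandom's corpus. A minimal sketch of how such thresholds could instead be read off the kudos column with pandas quantiles; the function name, the target shares, and which bucket gets which share are assumptions for illustration, not what the commit itself does:

import pandas as pd

def kudos_thresholds(df, first_share=0.5, second_share=0.3):
    # kudos may be stored as strings with thousands separators, so coerce first
    kudos = pd.to_numeric(df["kudos"], errors="coerce").dropna()
    few_kudos_thres = kudos.quantile(first_share)                    # upper boundary of the first bucket
    medium_kudos_thres = kudos.quantile(first_share + second_share)  # upper boundary of the middle bucket
    return int(few_kudos_thres), int(medium_kudos_thres)

# usage sketch
# cosmere = pd.read_csv("cosmere/data/fanfics/cosmere_fanfics_new.csv")
# few, medium = kudos_thresholds(cosmere)
# separate_fanfics_by_good_medium_bad(cosmere, "cosmere", few, medium)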
singular_fanfics_stylometry.py  (+21 −18)
@@ -87,7 +87,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)): #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         #    new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
@@ -109,14 +109,14 @@ def mendenhall_token_metrics(tokens):
     token_lengths = sorted(token_lengths)[trim_len:-trim_len]
     new_token_len_dist = calculate_freq_dist_as_clean_panda(token_lengths, most_common_limit=15) # token len freq dist
-    try:
-        standard_deviation = statistics.stdev(token_lengths)
-        mean = statistics.mean(token_lengths)
+    standard_deviation = statistics.stdev(token_lengths)
+    mean = statistics.mean(token_lengths)
+    """
     except:
         print("too short not enough tokens")
         standard_deviation = np.nan
         mean = np.nan
     """
     return new_token_len_dist, standard_deviation, mean
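Here the try/except around statistics.stdev and statistics.mean is retired: the assignments are dedented and the old except branch is left behind wrapped in a string literal, so short texts are no longer silently padded with np.nan inside the metric function. A minimal sketch of the token-length statistics this function computes, assuming the tokens arrive as a plain list of strings; the trim fraction and helper name are illustrative:

import statistics

def token_length_stats(tokens, trim_fraction=0.005):
    # Mendenhall-style metrics work on word lengths, not the words themselves
    token_lengths = [len(t) for t in tokens]
    # drop the extreme ends so a handful of very long "words" (URLs, keysmashes) don't skew the stats
    trim_len = int(len(token_lengths) * trim_fraction)
    if trim_len:
        token_lengths = sorted(token_lengths)[trim_len:-trim_len]
    # statistics.stdev needs at least two data points, so very short texts must be
    # filtered out by the caller (which is what the faulty-datapoint guard further down does)
    return statistics.mean(token_lengths), statistics.stdev(token_lengths)

# usage sketch
# mean_len, std_len = token_length_stats("the quick brown fox jumps over the lazy dog".split())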
@@ -221,14 +221,15 @@ def calculate_sent_len_dist(text):
     sent_len_dist_short = calculate_freq_dist_as_clean_panda(sent_lens, most_common_limit=25)
     # calculate the standard deviation, mean
-    try:
-        standard_deviation_sent = statistics.stdev(sent_lens)
-        mean_sent = statistics.mean(sent_lens)
+    standard_deviation_sent = statistics.stdev(sent_lens)
+    mean_sent = statistics.mean(sent_lens)
+    """
     except:
         print("too short not enough sents")
         standard_deviation_sent = np.nan
         mean_sent = np.nan
+    """
     return sent_len_dist, sent_len_dist_short, standard_deviation_sent, mean_sent
@@ -272,8 +273,9 @@ def execute_funcs(file_paths, ExampleClass, save_distributions_to_csv=False, dat
     mean_sent = []
     std_dev_sents = []
     nums_of_kudos = []
+    not_first = 0
     for file_path in file_paths:
         print(file_path) #"cosmere/data/fanfics/cosmere_fanfics_new.csv"
         fanfic_df = pd.read_csv(file_path, thousands=",")
         # get series from directory path
@@ -290,13 +292,15 @@ def execute_funcs(file_paths, ExampleClass, save_distributions_to_csv=False, dat
             num_of_kudos = pd.to_numeric(row["kudos"], errors='coerce')
             text = row["body"]
             C = ExampleClass(text=text, num_of_kudos=num_of_kudos)
-            if len(C.clean_tokens) <= 10:
-                # Calculate Stuff
-                C.calculate_mendenhall_token_metrics()
-                C.calculate_pos_tag_distribution()
-                C.calculate_sent_len_distribution()
-                C.calculate_punct_distribution()
+            try:
+                # Calculate Stuff
+                C.calculate_mendenhall_token_metrics()
+                C.calculate_pos_tag_distribution()
+                C.calculate_sent_len_distribution()
+                C.calculate_punct_distribution()
+            except:
+                print(f"faulty datapoint {index}\n{C.clean_tokens}")
+                continue
             # data overview csv
             if data_overview_csv:
                 mean_tokens.append(C.tk_len_mean)
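Error handling for single fics now lives in the loop of execute_funcs: each fic's four calculate_* calls are wrapped in a try/except, and a fic that fails (typically one too short for statistics.stdev or the taggers) is reported as a faulty datapoint and skipped with continue instead of aborting the whole run. A minimal sketch of the same pattern with hypothetical names; compute_metrics stands in for the calculate_* calls on the StylometryMetrics-style object:

import logging

def process_rows(fanfic_df, compute_metrics):
    # any exception raised while computing a row's metrics marks that row as faulty
    results, faulty = [], []
    for index, row in fanfic_df.iterrows():
        try:
            results.append(compute_metrics(row))
        except Exception:
            # a bare `except:` (as in the diff) would also swallow KeyboardInterrupt,
            # so catching Exception is the slightly safer default
            logging.warning("faulty datapoint %s", index)
            faulty.append(index)
            continue
    return results, faulty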
@@ -366,15 +370,14 @@ if __name__ == "__main__":
     file_path_list = ['call_me_by_your_name/data', 'cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'red_white_royal_blue/data', 'school_for_good_and_evil/data', 'simonverse/data', 'song_of_achilles/data', 'throne_of_glass/data',]
     #file_path_list = ['song_of_achilles/data', 'throne_of_glass/data', 'simonverse/data', ]
     file_paths_lists = [os.listdir(f"{file_path}/fanfics") for file_path in file_path_list]
     file_paths = []
     for file_path in file_paths_lists:
         #print(*file_path_list)
         file_paths.append(*file_path)
     file_paths_new = [f"{folder}/fanfics/{file}" for folder, file in zip(file_path_list, file_paths)]
     execute_funcs(file_paths=file_paths_new, ExampleClass=StylometryMetrics, save_distributions_to_csv=True, data_overview_csv=True)
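The __main__ block pairs every "<series>/data" directory with whatever os.listdir finds in its fanfics/ subfolder; note that file_paths.append(*file_path) only works if each folder contains exactly one file. A minimal sketch of the same path assembly written a little more defensively; filtering to the *_new.csv exports is an assumption about the layout, not something the diff states:

import os

def collect_fanfic_csvs(series_dirs):
    # one "<series>/data/fanfics/<file>.csv" path per series directory
    csv_paths = []
    for series_dir in series_dirs:
        fanfic_dir = os.path.join(series_dir, "fanfics")
        candidates = [f for f in os.listdir(fanfic_dir) if f.endswith("_new.csv")]
        # append(*files) in the diff assumes exactly one file per folder; make that explicit
        if len(candidates) != 1:
            raise ValueError(f"expected exactly one CSV in {fanfic_dir}, found {candidates}")
        csv_paths.append(os.path.join(fanfic_dir, candidates[0]))
    return csv_paths

# usage sketch
# file_paths_new = collect_fanfic_csvs(['cosmere/data', 'divergent/data'])
# execute_funcs(file_paths=file_paths_new, ExampleClass=StylometryMetrics,
#               save_distributions_to_csv=True, data_overview_csv=True)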
stylo_sing.py  (+5 −2)
@@ -109,7 +109,7 @@ def calculate_freq_dist_as_clean_panda(list_of_items, most_common_limit=False):
     for i in range(0, len(new_dist.index)): #for index in new_token_len_dist.index:
-        new_dist.iat[i] = round(new_dist.iat[i]/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
+        new_dist.iat[i] = round(float(new_dist.iat[i])/len(list_of_items), 3) #index-1 bc the index starts counting from zero, the word lengths not
         #if float(new_token_len_dist.iat[i]) == 0.00:
         #    new_token_len_dist.drop(index=i) # here it is used as the label, so we want the index, not index -1; bad work-around, I'm sorry
@@ -601,7 +601,10 @@ def execute_funcs(dir_paths, ExampleClass, plt_stuff=False, save_distributions_t
 if __name__ == "__main__":
     #dir_paths = ['cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'school_for_good_and_evil/data', 'throne_of_glass/data',]
-    dir_paths = ['trial_times']
+    #dir_paths = ['trial_times']
+    dir_paths = ['call_me_by_your_name/data', 'cosmere/data', 'divergent/data', 'grishaverse/data', 'maze_runner/data', 'murderbot/data', 'percy/data', 'red_white_royal_blue/data', 'school_for_good_and_evil/data', 'simonverse/data', 'song_of_achilles/data', 'throne_of_glass/data',]
+    #execute_funcs(dir_paths=dir_paths, ExampleClass=StylometryMetrics, plt_stuff=True, save_distributions_to_csv=True, data_overview_csv=True)
     execute_funcs(dir_paths=dir_paths, ExampleClass=StylometryMetrics, plt_stuff=False, save_distributions_to_csv=True)