perov / NLP_Evaluation

Commit acf7f897, authored 2 weeks ago by perov
add a script to assess the texts used in the survey automatically
Parent: 976f42ac
Showing 1 changed file: src/evaluate_automatic.py (new file, mode 100644) with 275 additions and 0 deletions.
import automatic_metrics as am
from pathlib import Path
import re
import copy

def extract_marked_text(file_path):
    """
    Uses "X" as a marker to find which lines/texts to extract. Passages without an X are ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    final_poems = {}
    poems = []
    current_poem = []
    collecting = False
    idx = 0

    for line in lines:
        # A numbered header line ending in "X" starts a passage that should be extracted
        match = re.match(r"(\d+):.*\bX\s*$", line.strip())

        # Flush any non-empty buffered lines into the list of collected pieces
        if collecting and current_poem:
            if current_poem and any(line.strip() for line in current_poem):
                poems.append("\n".join(current_poem).strip())
            current_poem = []

        if match:
            # New marked passage: store the previous one (if any) and keep collecting
            if collecting:
                final_poems[idx] = ''.join(poems)
                idx += 1
                poems = []
            collecting = True
        elif re.match(r"\d+:", line.strip()):
            # Numbered header without an "X": store the previous passage and stop collecting
            if collecting:
                final_poems[idx] = ''.join(poems)
                idx += 1
                poems = []
            collecting = False
        elif collecting:
            current_poem.append(line.strip())

    # Store the last passage if the file ended while still collecting
    if collecting and current_poem:
        poems.append("\n".join(current_poem).strip())
    if collecting and current_poem:
        final_poems[idx] = ''.join(poems)

    return final_poems
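
# Example of the assumed survey-file layout (inferred from the regexes above; the real data
# files may differ). A numbered header ending in "X" marks a passage to extract, a numbered
# header without "X" marks one to skip:
#
#   1: First passage X
#   first line of the passage
#   second line of the passage
#
#   2: Skipped passage
#   this text is ignored because its header has no trailing X
#
# Minimal usage sketch (hypothetical path):
#
#   marked = extract_marked_text("data/human/poetry.txt")
#   for i, passage in marked.items():
#       print(i, passage[:40])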

def get_all_data_from_folder(foldername, datatype="txt"):
    """
    Extracts the marked texts from all files of the given type in the given folder for further processing.
    """
    script_dir = Path(__file__).resolve().parent
    data_dir = script_dir.parent / f"{foldername}"
    files = list(data_dir.rglob(f"*.{datatype}"))
    all_extracted_text = {}
    for file in files:
        relative_file_location = file.relative_to(data_dir)
        text = extract_marked_text(file)
        all_extracted_text[str(relative_file_location)] = text
    return all_extracted_text
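
# Assumed repository layout, inferred from the keys used in the __main__ block below
# (paths relative to the project root, i.e. the parent of src/):
#
#   data/
#     ai/     gpt2_poem.txt, opt_poem.txt, gpt4o_poem.txt, gpt2_wiki.txt, ...
#     human/  poetry.txt, wiki.txt, sport_bbc.txt
#
# get_all_data_from_folder("data", "txt") then returns something like
#   {'ai\\gpt2_poem.txt': {0: '...', 1: '...'}, 'human\\poetry.txt': {0: '...', ...}, ...}
# Note that the backslash-separated keys only match the hard-coded keys in __main__ when the
# relative paths are rendered with backslashes, which appears to assume a Windows environment.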

def calculate_scores_texts(text):
    """
    Calculates the automatic metric scores (FRE, TTR, PMI, TF-IDF) for every extracted text.
    """
    texts = copy.deepcopy(text)
    evaluator = am.Compute_Metrics()
    evaluated_texts = {}
    for filename in texts:
        for idx in texts[filename]:
            text = texts[filename][idx]
            calc_metrics = []
            calc_metrics.append(evaluator.compute_fre(text))
            calc_metrics.append(evaluator.compute_ttr(text))
            calc_metrics.append(evaluator.compute_pmi(text))
            calc_metrics.append(evaluator.compute_tfidf(text))
            evaluated_texts[f"{filename}\\{idx}"] = calc_metrics
    return evaluated_texts  # {filename\idx: [fre, ttr, pmi, tfidf]}
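
# Result shape with hypothetical metric values, e.g. for two extracted passages of one AI file:
#   {'ai\\gpt2_poem.txt\\0': [71.3, 0.62, 2.4, 0.18],
#    'ai\\gpt2_poem.txt\\1': [55.0, 0.58, 3.1, 0.22]}
# Each list holds [FRE, TTR, PMI, TF-IDF] as computed by automatic_metrics.Compute_Metrics.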

class Calculate_Parameters(object):
    """
    Automated procedure to calculate the survey parameters (coherence, conciseness,
    creativity, clarity of concept) from the automatic metrics of an AI/human text pair.
    """
    def __init__(self, metrics_ai, metrics_human, question_num):
        # FRE, TTR, PMI, and TF-IDF are at index 0, 1, 2, and 3
        self.ai_fre, self.ai_ttr, self.ai_pmi, self.ai_tfidf = metrics_ai[0], metrics_ai[1], metrics_ai[2], metrics_ai[3]
        self.human_fre, self.human_ttr, self.human_pmi, self.human_tfidf = metrics_human[0], metrics_human[1], metrics_human[2], metrics_human[3]
        self.question_num = question_num

    def calculate_coherence(self):
        # The text with the notably higher FRE, PMI (double weight) and TF-IDF scores points for coherence
        score = 0
        if abs(self.ai_fre - self.human_fre) >= 20:
            if self.ai_fre > self.human_fre:
                score += 1
            else:
                score -= 1
        if abs(self.ai_pmi - self.human_pmi) >= 0.8:
            if self.ai_pmi > self.human_pmi:
                score += 2
            else:
                score -= 2
        if abs(self.ai_tfidf - self.human_tfidf) >= 0.2:
            if self.ai_tfidf > self.human_tfidf:
                score += 1
            elif self.ai_tfidf < self.human_tfidf:
                score -= 1
        if score > 0:
            return "ai"
        if score < 0:
            return "human"
        if score == 0:
            return "equal"

    def calculate_conciseness(self):
        # The text with the notably higher PMI and lower TTR (double weight) scores points for conciseness
        score = 0
        if abs(self.ai_pmi - self.human_pmi) >= 1:
            if self.ai_pmi > self.human_pmi:
                score += 1
            else:
                score -= 1
        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
            if self.ai_ttr < self.human_ttr:
                score += 2
            elif self.ai_ttr > self.human_ttr:
                score -= 2
        if score > 0:
            return "ai"
        if score < 0:
            return "human"
        if score == 0:
            return "equal"

    def calculate_creativity(self):
        # The text with the notably lower PMI, higher TTR and lower FRE scores points for creativity
        score = 0
        if abs(self.ai_pmi - self.human_pmi) >= 1:
            if self.ai_pmi < self.human_pmi:
                score += 1
            else:
                score -= 1
        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
            if self.ai_ttr > self.human_ttr:
                score += 1
            elif self.ai_ttr < self.human_ttr:
                score -= 1
        if abs(self.ai_fre - self.human_fre) >= 20:
            if self.ai_fre < self.human_fre:
                score += 1
            else:
                score -= 1
        if score > 0:
            return "ai"
        if score < 0:
            return "human"
        if score == 0:
            return "equal"

    def calculate_clarity_of_concept(self):
        # The text with the notably lower PMI, lower TTR, lower FRE and higher TF-IDF scores points for clarity
        score = 0
        if abs(self.ai_pmi - self.human_pmi) >= 1:
            if self.ai_pmi < self.human_pmi:
                score += 1
            else:
                score -= 1
        if abs(self.ai_ttr - self.human_ttr) >= 0.1:
            if self.ai_ttr < self.human_ttr:
                score += 1
            else:
                score -= 1
        if abs(self.ai_fre - self.human_fre) >= 20:
            if self.ai_fre < self.human_fre:
                score += 1
            else:
                score -= 1
        if abs(self.ai_tfidf - self.human_tfidf) >= 0.2:
            if self.ai_tfidf > self.human_tfidf:
                score += 1
            elif self.ai_tfidf < self.human_tfidf:
                score -= 1
        if score > 0:
            return "ai"
        if score < 0:
            return "human"
        if score == 0:
            return "equal"

def predict_human_ai(survey_assessment):
    """
    Counts how often the human text and how often the AI text had the better scores on the
    rated parameters. The returned tag is the predicted tag; "equal" means no decision was possible.
    """
    predicted_tags = {}
    for question_num, rated_param in survey_assessment.items():
        ai = 0
        human = 0
        keys_to_check = set(rated_param.keys())
        # Ignore certain parameters depending on the survey question block
        if question_num <= 6:
            keys_to_check.discard("clarity_of_concept")
        elif 6 < question_num <= 12:
            keys_to_check.discard("creativity")
        elif 12 < question_num <= 18:
            keys_to_check.discard("clarity_of_concept")
            keys_to_check.discard("creativity")
        for key in keys_to_check:
            if rated_param[key] == "ai":
                ai += 1
            elif rated_param[key] == "human":
                human += 1
        # Determine result
        if human > ai:
            predicted_tags[question_num] = "human"
        elif human < ai:
            predicted_tags[question_num] = "ai"
        else:
            predicted_tags[question_num] = "equal"
    return predicted_tags
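
# Illustrative helper (never called): a worked example with hypothetical ratings. For question 1
# the clarity_of_concept rating is ignored, so the AI text wins 2:1 and the question is tagged "ai".
def _demo_predict_human_ai():
    example = {1: {"coherence": "ai", "conciseness": "human",
                   "creativity": "ai", "clarity_of_concept": "human"}}
    print(predict_human_ai(example))  # {1: 'ai'}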

if __name__ == '__main__':
    survey_texts = get_all_data_from_folder("data", "txt")
    evaluated_texts = calculate_scores_texts(survey_texts)

    # The texts are ordered manually in the order used in the survey
    survey_ai_texts = ['ai\\gpt2_poem.txt\\0', 'ai\\gpt2_poem.txt\\1', 'ai\\opt_poem.txt\\0', 'ai\\opt_poem.txt\\1',
                       'ai\\gpt4o_poem.txt\\0', 'ai\\gpt4o_poem.txt\\1', 'ai\\gpt4o_wiki.txt\\0', 'ai\\gpt4o_wiki.txt\\1',
                       'ai\\opt_wiki.txt\\0', 'ai\\opt_wiki.txt\\1', 'ai\\gpt2_wiki.txt\\0', 'ai\\gpt2_wiki.txt\\1',
                       'ai\\opt_sport.txt\\0', 'ai\\opt_sport.txt\\1', 'ai\\gpt4o_sports.txt\\0', 'ai\\gpt4o_sports.txt\\1',
                       'ai\\gpt2_sport.txt\\0', 'ai\\gpt2_sport.txt\\1']
    survey_human_texts = ['human\\poetry.txt\\0', 'human\\poetry.txt\\1', 'human\\poetry.txt\\2', 'human\\poetry.txt\\3',
                          'human\\poetry.txt\\4', 'human\\poetry.txt\\5', 'human\\wiki.txt\\0', 'human\\wiki.txt\\1',
                          'human\\wiki.txt\\2', 'human\\wiki.txt\\3', 'human\\wiki.txt\\4', 'human\\wiki.txt\\5',
                          'human\\sport_bbc.txt\\0', 'human\\sport_bbc.txt\\1', 'human\\sport_bbc.txt\\2',
                          'human\\sport_bbc.txt\\3', 'human\\sport_bbc.txt\\4', 'human\\sport_bbc.txt\\5']
    survey_groups = zip(survey_ai_texts, survey_human_texts)

    # Rate the parameters coherence, creativity, conciseness and clarity of concept for each survey group
    survey_assessment = {}
    for i, group in enumerate(survey_groups, start=1):
        for idx, name in enumerate(group):
            if idx < len(group) - 1:
                metrics_ai = evaluated_texts[name]
                metrics_human = evaluated_texts[group[idx + 1]]
                evaluation_metrics = Calculate_Parameters(metrics_ai, metrics_human, i)
                coherence_score = evaluation_metrics.calculate_coherence()
                conciseness_score = evaluation_metrics.calculate_conciseness()
                creativity_score = evaluation_metrics.calculate_creativity()
                clarity_score = evaluation_metrics.calculate_clarity_of_concept()
                survey_assessment[i] = {"coherence": coherence_score, "conciseness": conciseness_score,
                                        "creativity": creativity_score, "clarity_of_concept": clarity_score}

    # Automatically assess whether each text is human-written or AI-generated
    result = predict_human_ai(survey_assessment)
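
# To reproduce the assessment (assuming automatic_metrics.py sits next to this script and a
# data/ folder with ai/ and human/ subfolders exists at the project root), run:
#
#   python src/evaluate_automatic.py
#
# `result` then maps each survey question number to the predicted tag ("ai", "human" or "equal").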