Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
softwareprojektws17
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Steffen Knapp
softwareprojektws17
Commits
71b30bfa
Commit
71b30bfa
authored
7 years ago
by
Maximilian Blunck
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of
https://gitlab.cl.uni-heidelberg.de/knapps/softwareprojektws17
parents
65340bcf
5c99e1e1
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
postagger.py
+106
-32
106 additions, 32 deletions
postagger.py
with
106 additions
and
32 deletions
postagger.py
+
106
−
32
View file @
71b30bfa
...
...
@@ -3,56 +3,130 @@ from nltk.tokenize import word_tokenize
from
corpus
import
read_corpus
"""
turning the entire corpus into a bag of words (lemmas).
returns: list
TODO
- something useful to do with the feature vectors
"""
def
to_bag_of_words
(
corpus
):
for
entry
in
corpus
:
for
word
in
word_tokenize
(
str
(
entry
[
'
REVIEW
'
])):
if
word
not
in
bag_of_words
:
bag_of_words
.
append
(
word
)
return
bag_of_words
"""
pos-tagging the entire corpus token-wise.
Returns the raw corpus as a list
e.g. [[(
'
No
'
,
'
DT
'
)], [(
'
Just
'
,
'
RB
'
), (
'
no
'
,
'
DT
'
)]]
"""
def
corpus_pos_tagger
(
corpus
):
tagged_corpus
=
[]
temp_entry
=
[]
for
entry
in
corpus
:
tagged_corpus
.
append
(
nltk
.
pos_tag
(
word_tokenize
(
str
(
entry
[
'
REVIEW
'
]))))
return
tagged_corpus
temp_entry
=
nltk
.
pos_tag
(
word_tokenize
(
str
(
entry
[
'
REVIEW
'
])))
tagged_corpus
.
append
(
temp_entry
)
temp_entry
=
[]
return
(
tagged_corpus
)
"""
for each review in the corpus, the number of occurences of each token is written
into a feature vector of the same length as the bag of words list.
returns: list of lists
Same format as above, reduces the tuples to pos-tags
e.g. [[
'
DT
'
,
'
,
'
,
'
NN
'
], [
'
DT
'
,
'
,
'
,
'
NN
'
]]
"""
def
to_vector
(
bag_of_words
,
corpus
):
sentence_vector_list
=
[]
def
tagged_corpus_to_pos_unigrams
(
tagged_corpus
):
pos_unigrams
=
[]
temp_pos
=
[]
for
entry
in
tagged_corpus
:
for
token
in
entry
:
temp_pos
.
append
(
token
[
1
])
pos_unigrams
.
append
(
temp_pos
)
temp_pos
=
[]
return
pos_unigrams
"""
Returns the bigrams for each review
e.g. [[(
'
DT
'
,
'
,
'
), (
'
,
'
,
'
NN
'
)], [(
'
DT
'
,
'
,
'
), (
'
,
'
,
'
NN
'
)]]
"""
def
pos_unigrams_to_bigrams
(
input_list
):
bigram_list
=
[]
temp_bigram
=
[]
for
review
in
input_list
:
for
i
in
range
(
len
(
review
)
-
1
):
temp_bigram
.
append
((
review
[
i
],
review
[
i
+
1
]))
bigram_list
.
append
(
temp_bigram
)
temp_bigram
=
[]
return
bigram_list
"""
Takes all the bigrams and turns them into a bag of bigrams
e.g. [(
'
DT
'
,
'
,
'
), (
'
,
'
,
'
NN
'
)]
"""
def
to_bag_of_bigrams
(
bigram_list
):
bag_of_bigrams
=
[]
for
review
in
bigram_list
:
for
bigram
in
review
:
if
bigram
not
in
bag_of_bigrams
:
bag_of_bigrams
.
append
(
bigram
)
return
bag_of_bigrams
"""
TODO: explanation that
'
s not stupid
"""
def
to_bigram_vector
(
bag_of_bigrams
,
corpus
):
#corpus is the bigram_list
review_vector_list
=
[]
for
entry
in
corpus
:
sentence_vector
=
[]
review
=
word_tokenize
(
str
(
entry
[
'
REVIEW
'
]))
review_vector
=
[]
for
bigram
in
bag_of_bigrams
:
review_vector
.
append
(
entry
.
count
(
bigram
))
review_vector_list
.
append
(
review_vector
)
return
review_vector_list
"""
The functions below are intended to be used on token-level (bag of words)
"""
def
to_token_vector
(
bag_of_words
,
corpus
):
review_vector_list
=
[]
for
entry
in
corpus
:
review_vector
=
[]
review
=
word_tokenize
(
str
(
entry
[
'
REVIEW
'
]))
for
word
in
bag_of_words
:
sentence_vector
.
append
(
review
.
count
(
word
))
review_vector
.
append
(
review
.
count
(
word
))
review_vector_list
.
append
(
review_vector
)
return
review_vector_list
sentence_vector_list
.
append
(
sentence_vector
)
return
sentence_vector_list
def to_bag_of_words(corpus):
    """Turn the entire corpus into a bag of words (unique tokens).

    corpus: iterable of entries with a 'REVIEW' text field.
    Returns: list of unique tokens in first-seen order.
    """
    bag_of_words = []
    # perf: `word not in bag_of_words` scanned the growing list for every
    # token (O(n^2) over ~25k unique tokens for corpus.csv); a companion
    # set makes the membership check O(1) and keeps first-seen order
    seen = set()
    for entry in corpus:
        for word in word_tokenize(str(entry['REVIEW'])):
            if word not in seen:
                seen.add(word)
                bag_of_words.append(word)
    return bag_of_words
#fun fact: len(bag_of_words) is 25325 for corpus.csv
if __name__ == '__main__':
    corpus = read_corpus("minicorpus.csv")
    bag_of_words = to_bag_of_words(corpus)
    # this should be taken out when working with the complete corpus
    # fix: call the token-level vectorizer to_token_vector (the old name
    # to_vector survives only as a broken stub)
    for vektor in to_token_vector(bag_of_words, corpus):
        print(str(vektor) + "\n")
    tagged_corpus = corpus_pos_tagger(corpus)
    pos_unigrams = tagged_corpus_to_pos_unigrams(tagged_corpus)
    pos_bigrams = pos_unigrams_to_bigrams(pos_unigrams)
    bag_of_bigrams = to_bag_of_bigrams(pos_bigrams)
    # sanity check: each feature vector must be exactly as long as the bag
    # of words it was built from
    if len(bag_of_words) != len(to_token_vector(bag_of_words, corpus)[0]):
        print("Irgendwas lief schief (Featurevektor und Bag of Words nicht gleich lang)")
    corpus_vector = to_bigram_vector(bag_of_bigrams, pos_bigrams)
    for vector in corpus_vector:
        print(vector)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment