# N-gram count features for review texts, built on scikit-learn's CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
def extract(corpus_instance, vocabulary):
    """
    Extract n-gram count features from a single corpus instance.

    The n-gram order is inferred from the vocabulary terms themselves, so a
    vocabulary produced by get_vocabulary for any n is counted correctly.
    CountVectorizer does not derive its n-gram range from a supplied
    vocabulary; left at the default (1, 1) it would only look for unigrams
    and report zero for every higher-order term.

    Args:
        corpus_instance: mapping holding the review text under 'REVIEW'.
        vocabulary: term -> column index mapping (as returned by
            get_vocabulary) or an iterable of terms.

    Returns:
        1-D numpy array of counts with one entry per vocabulary term.
    """
    # A one-shot iterator would be exhausted by the scan below and reach
    # CountVectorizer empty, so materialize it first.
    if iter(vocabulary) is vocabulary:
        vocabulary = list(vocabulary)

    # Word n-grams are stored as tokens joined by single spaces, so the
    # number of spaces in a term gives its order.
    orders = {term.count(" ") + 1 for term in vocabulary}
    # Empty vocabulary: keep the default range and let CountVectorizer
    # report the problem itself.
    ngram_range = (min(orders), max(orders)) if orders else (1, 1)

    vectorizer = CountVectorizer(vocabulary=vocabulary, ngram_range=ngram_range)
    vector = vectorizer.transform([corpus_instance['REVIEW']])  # takes a list
    return vector.toarray()[0]
def get_vocabulary(corpus, n):
    """
    Build the n-gram vocabulary of a corpus.

    Gathers the 'REVIEW' text of every corpus entry, fits a CountVectorizer
    restricted to n-grams of exactly order n, and returns the fitted
    vectorizer's vocabulary_ attribute.
    """
    review_texts = [entry['REVIEW'] for entry in corpus]
    ngram_counter = CountVectorizer(ngram_range=(n, n))
    # fit() returns the estimator itself, so the lookup can be chained.
    return ngram_counter.fit(review_texts).vocabulary_