ngram_feature.py

from sklearn.feature_extraction.text import CountVectorizer


def extract(corpus_instance, vocabulary):
	"""
	Extracts n-gram features from a single corpus instance.
	n depends on vocabulary, which needs to be extracted using get_vocabulary.
	Returns numpy array of size of vocabulary
	"""
	vectorizer = CountVectorizer(vocabulary=vocabulary)
	vector = vectorizer.transform([corpus_instance['REVIEW']]) # takes a list

	return vector.toarray()[0]


def get_vocabulary(corpus, n):
	"""
	Creates vocabulary based on given corpus.
	"""
	all_reviews = []
	for line in corpus:
		all_reviews.append(line['REVIEW'])

	vectorizer = CountVectorizer(ngram_range=(n, n))
	vectorizer.fit(all_reviews)

	return vectorizer.vocabulary_