Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import sent_rating_feature
import ngram_feature
import pos_feature
import punctuation_feature
import contrast_feature
import numpy as np
import config
def create_vector(corpus_instance, uni_gram_vocab=None, pos_vocabulary=None, surface_vocabulary=None):
    """
    Build the input vector for a single corpus instance.

    Runs the feature extractors named in ``config.feature_selection``
    (keys 'f1' .. 'f6', in the configured order) and joins their results
    into one vector.

    Example for corpus instance: OrderedDict([('LABEL', '0'), ('STARS', '5.0'), etc.

    Args:
        corpus_instance: the corpus entry handed to every extractor.
        uni_gram_vocab: vocabulary passed to the 'REVIEW' n-gram extractor (f1).
        pos_vocabulary: vocabulary passed to the POS extractor (f2).
        surface_vocabulary: vocabulary passed to the 'SURFACE_PATTERNS'
            n-gram extractor (f3).

    Returns:
        If exactly one feature is selected, that extractor's return value
        unchanged. Otherwise the result of successively ``np.append``-ing
        the selected feature arrays (np.append flattens its inputs).

    Raises:
        IndexError: if ``config.feature_selection`` is empty.
        KeyError: if a selected key is not one of 'f1' .. 'f6'.
    """
    # Extractors are wrapped in callables so that only the selected ones run;
    # building every feature for every instance is wasted work when
    # config.feature_selection names just a subset.
    extractors = {
        'f1': lambda: ngram_feature.extract(corpus_instance, 'REVIEW', uni_gram_vocab),
        'f2': lambda: pos_feature.extract(corpus_instance, pos_vocabulary),
        'f3': lambda: ngram_feature.extract(corpus_instance, 'SURFACE_PATTERNS', surface_vocabulary),
        'f4': lambda: sent_rating_feature.extract(corpus_instance),
        'f5': lambda: punctuation_feature.extract(corpus_instance),
        'f6': lambda: contrast_feature.extract(corpus_instance),
    }

    selected = config.feature_selection
    vector = extractors[selected[0]]()
    for key in selected[1:]:
        vector = np.append(vector, extractors[key]())
    return vector
def extract_features(train_set, test_set):
    """
    Turn the train and test corpora into lists of feature vectors.

    The three vocabularies (word n-grams, POS, surface patterns) are built
    from ``train_set`` only and then reused for both sets, so test vectors
    are expressed in the training vocabulary. Summary statistics are
    printed to stdout.

    Args:
        train_set: iterable of corpus instances used for the vocabularies
            and the training vectors.
        test_set: iterable of corpus instances for the test vectors.

    Returns:
        A tuple ``(train_inputs, test_inputs)``; each is a list holding one
        ``create_vector`` result per corpus instance.
    """
    # vocabularies (train data only)
    n_gram_vocab = ngram_feature.get_vocabulary(train_set, 'REVIEW', config.n_range_words)
    pos_bigram_vocab = pos_feature.get_pos_vocabulary(train_set)
    surface_bigram_vocab = ngram_feature.get_vocabulary(train_set, 'SURFACE_PATTERNS', config.n_range_surface_patterns)

    def vectorize(instances):
        # Same vocabularies for every instance, regardless of which set it is from.
        return [create_vector(el, n_gram_vocab, pos_bigram_vocab, surface_bigram_vocab)
                for el in instances]

    # inputs
    print("------Feature Extraction------\n")
    train_inputs = vectorize(train_set)
    test_inputs = vectorize(test_set)

    # stats
    # An empty training set has no first vector to measure; report 0 instead
    # of crashing with IndexError inside a diagnostic print.
    features_per_sample = len(train_inputs[0]) if train_inputs else 0
    print("Number of train samples: {}".format(len(train_inputs)))
    print("N_gram vocab size: {}".format(len(n_gram_vocab)))
    print("POS-Bigram vocab size: {}".format(len(pos_bigram_vocab)))
    print("SP-Bigram vocab size: {}".format(len(surface_bigram_vocab)))
    print("Total features per train sample: {}".format(features_per_sample))

    return train_inputs, test_inputs