Skip to content
Snippets Groups Projects
Commit cfcb855f authored by chrysanthopoulou's avatar chrysanthopoulou
Browse files

Add singular computation of fanfic groups

parent 73ec47c9
No related branches found
No related tags found
No related merge requests found
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
......@@ -322,7 +322,7 @@ class StylometryMetrics:
tk_len_dist, tk_len_std, tk_len_mean = mendenhall_token_metrics(clean_tk)
tk_len_stds.append(tk_len_std)
tk_len_means.append(tk_len_mean)
avg_tk_len_dist = tk_len_dist
avg_tk_len_dist = tk_len_dist.to_frame().T
except:
continue
else:
......@@ -330,14 +330,13 @@ class StylometryMetrics:
tk_len_dist, tk_len_std, tk_len_mean = mendenhall_token_metrics(clean_tk)
tk_len_stds.append(tk_len_std)
tk_len_means.append(tk_len_mean)
avg_tk_len_dist = pd.concat([avg_tk_len_dist, tk_len_dist], axis=0, join='outer')
avg_tk_len_dist = pd.concat([avg_tk_len_dist, tk_len_dist.to_frame().T], axis=0, join='outer')
except:
continue
self.tk_len_std = statistics.mean(tk_len_stds)
self.tk_len_mean = statistics.mean(tk_len_means)
self.tk_len_dist = avg_tk_len_dist.mean().T
print(self.tk_len_dist)
print(type(self.tk_len_dist))
avg_tk_len_dist = avg_tk_len_dist.fillna(0) # If a text has no tokens of a given length (e.g. no 18-letter tokens), count that as 0 rather than NaN; mean() skips NaN, so such texts would otherwise be left out of the average for that length.
self.tk_len_dist = avg_tk_len_dist.mean()
else:
self.tk_len_dist, self.tk_len_std, self.tk_len_mean = mendenhall_token_metrics(self.clean_tokens)
......@@ -349,14 +348,16 @@ class StylometryMetrics:
clean_tk = tokenize_and_clean_text(text)
if index == 0:
tag_freq_dist, md_freq_dist = pos_tag_freq(clean_tk)
avg_tag_freq_dist = tag_freq_dist
avg_md_freq_dist = md_freq_dist
avg_tag_freq_dist = tag_freq_dist.to_frame().T
avg_md_freq_dist = md_freq_dist.to_frame().T
else:
tag_freq_dist, md_freq_dist = pos_tag_freq(clean_tk)
avg_tag_freq_dist = pd.concat([avg_tag_freq_dist, tag_freq_dist], axis=0, join='outer')
avg_md_freq_dist = pd.concat([avg_md_freq_dist, md_freq_dist], axis=0, join='outer')
self.tag_freq_dist = avg_tag_freq_dist.mean().T
self.md_freq_dist = avg_md_freq_dist.mean().T
avg_tag_freq_dist = pd.concat([avg_tag_freq_dist, tag_freq_dist.to_frame().T], axis=0, join='outer')
avg_md_freq_dist = pd.concat([avg_md_freq_dist, md_freq_dist.to_frame().T], axis=0, join='outer')
avg_md_freq_dist = avg_md_freq_dist.fillna(0)
avg_tag_freq_dist = avg_tag_freq_dist.fillna(0)
self.tag_freq_dist = avg_tag_freq_dist.mean()
self.md_freq_dist = avg_md_freq_dist.mean()
else:
self.tag_freq_dist, self.md_freq_dist = pos_tag_freq(self.clean_tokens)
......@@ -366,11 +367,12 @@ class StylometryMetrics:
for index, row in self.data.iterrows():
text = row["body"]
if index == 0:
avg_punct_tag_freq_dist = calculate_punct_dist(text)
avg_punct_tag_freq_dist = calculate_punct_dist(text).to_frame().T
else:
punct_tag_freq_dist = calculate_punct_dist(text)
avg_punct_tag_freq_dist = pd.concat([avg_punct_tag_freq_dist, punct_tag_freq_dist], axis=0, join='outer')
self.punct_tag_freq_dist = avg_punct_tag_freq_dist.mean().T
avg_punct_tag_freq_dist = pd.concat([avg_punct_tag_freq_dist, punct_tag_freq_dist.to_frame().T], axis=0, join='outer')
avg_punct_tag_freq_dist = avg_punct_tag_freq_dist.fillna(0)
self.punct_tag_freq_dist = avg_punct_tag_freq_dist.mean()
else:
self.punct_tag_freq_dist = calculate_punct_dist(self.text)
......@@ -385,21 +387,26 @@ class StylometryMetrics:
sent_len_dist, sent_len_dist_short, sent_std_dev, sent_mean = calculate_sent_len_dist(text)
sent_means.append(sent_mean)
sent_std_devs.append(sent_std_dev)
avg_sent_len_dist = sent_len_dist
avg_sent_len_dist_short = sent_len_dist_short
avg_sent_len_dist = sent_len_dist.to_frame().T
avg_sent_len_dist_short = sent_len_dist_short.to_frame().T
#print(avg_sent_len_dist)
except: continue
else:
try:
sent_len_dist, sent_len_dist_short, sent_std_dev, sent_mean = calculate_sent_len_dist(text)
sent_means.append(sent_mean)
sent_std_devs.append(sent_std_dev)
avg_sent_len_dist = pd.concat([avg_sent_len_dist, sent_len_dist], axis=0, join='outer')
avg_sent_len_dist_short = pd.concat([avg_sent_len_dist_short, sent_len_dist_short], axis=0, join='outer')
avg_sent_len_dist = pd.concat([avg_sent_len_dist, sent_len_dist.to_frame().T], axis=0, join='outer')
#print(avg_sent_len_dist)
avg_sent_len_dist_short = pd.concat([avg_sent_len_dist_short, sent_len_dist_short.to_frame().T], axis=0, join='outer')
except: continue
self.sent_std_dev = statistics.mean(sent_std_devs)
self.sent_mean = statistics.mean(sent_means)
self.sent_len_dist = avg_sent_len_dist.mean().T
self.sent_len_dist_short = avg_sent_len_dist_short.mean().T
avg_sent_len_dist = avg_sent_len_dist.fillna(0)
self.sent_len_dist = avg_sent_len_dist.mean()
avg_sent_len_dist_short = avg_sent_len_dist_short.fillna(0)
self.sent_len_dist_short = avg_sent_len_dist_short.mean()
print(avg_sent_len_dist)
else:
self.sent_len_dist, self.sent_len_dist_short, self.sent_std_dev, self.sent_mean = calculate_sent_len_dist(self.text)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment