Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
Lsem-RC in nominal compounds
Manage
Activity
Members
Plan
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Terraform modules
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
burkhardt
Lsem-RC in nominal compounds
Commits
2446fb61
Commit
2446fb61
authored
3 years ago
by
engel
Browse files
Options
Downloads
Patches
Plain Diff
Add header to readme
parent
89e1f55e
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
temp/README.md
+4
-0
4 additions, 0 deletions
temp/README.md
temp/data_split.ipynb
+0
-474
0 additions, 474 deletions
temp/data_split.ipynb
with
4 additions
and
474 deletions
temp/README.md
+
4
−
0
View file @
2446fb61
# Data
## Table of Contents
- [Data](#data)
- [Table of Contents](#table-of-contents)
- [Searching for data](#searching-for-data)
- [Forcing BERTs Attention on the compound](#forcing-berts-attention-on-the-compound)
- [Limiting NC Occurrences](#limiting-nc-occurrences)
...
...
This diff is collapsed.
Click to expand it.
temp/data_split.ipynb
deleted
100644 → 0
+
0
−
474
View file @
89e1f55e
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"sentences_fine_200.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"104350.8\n",
"26087.7\n",
"43479.5\n"
]
}
],
"source": [
"sent_amount = len(df)\n",
"perc_train = sent_amount*0.6\n",
"perc_test = sent_amount*0.25\n",
"perc_val = sent_amount*0.15\n",
"print(perc_train)\n",
"print(perc_val)\n",
"print(perc_test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Create different DataFrames for each set\n",
"train_set = pd.DataFrame(columns = ['Relation', 'Label', 'NC', 'Sentence'])\n",
"val_set = pd.DataFrame(columns = ['Relation', 'Label', 'NC', 'Sentence'])\n",
"test_set = pd.DataFrame(columns = ['Relation', 'Label', 'NC', 'Sentence'])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sum 0\n",
"Label 1\n",
" not over 1210.2\n",
"sum 1197\n",
"Label 2\n",
" not over 1104.6\n",
"sum 2301\n",
"Label 3\n",
" not over 930.5999999999999\n",
"sum 3231\n",
"Label 4\n",
" not over 4637.4\n",
"sum 7868\n",
"Label 5\n",
" not over 1587.0\n",
"sum 9453\n",
"Label 6\n",
" not over 3437.4\n",
"sum 12890\n",
"Label 7\n",
" not over 6597.599999999999\n",
"sum 19797\n",
"Label 8\n",
" not over 297.59999999999997\n",
"sum 20094\n",
"Label 9\n",
" not over 103.8\n",
"sum 20192\n",
"Label 10\n",
" not over 1199.3999999999999\n",
"sum 21390\n",
"Label 11\n",
" not over 5962.8\n",
"sum 27352\n",
"Label 12\n",
" not over 1728.0\n",
"sum 29079\n",
"Label 13\n",
" not over 6036.599999999999\n",
"sum 35115\n",
"Label 14\n",
" not over 731.4\n",
"sum 35846\n",
"Label 15\n",
" not over 15852.0\n",
"sum 51706\n",
"Label 16\n",
" not over 1271.3999999999999\n",
"sum 52977\n",
"Label 17\n",
" not over 1659.6\n",
"sum 54732\n",
"Label 18\n",
" not over 4556.4\n",
"sum 59292\n",
"Label 19\n",
" not over 2518.7999999999997\n",
"sum 61824\n",
"Label 20\n",
" not over 1667.3999999999999\n",
"sum 63489\n",
"Label 21\n",
" not over 385.2\n",
"sum 63871\n",
"Label 22\n",
" not over 13704.0\n",
"sum 77574\n",
"Label 23\n",
" not over 2521.2\n",
"sum 80098\n",
"Label 24\n",
" not over 6627.599999999999\n",
"sum 86737\n",
"Label 25\n",
" not over 3777.0\n",
"sum 90523\n",
"Label 26\n",
" not over 1959.6\n",
"sum 92483\n",
"Label 27\n",
" not over 2916.6\n",
"sum 95400\n",
"Label 28\n",
" not over 819.6\n",
"sum 96217\n",
"Label 29\n",
" not over 5308.8\n",
"sum 100331\n",
"Label 30\n",
" not over 156.6\n",
"sum 100392\n",
"Label 31\n",
" not over 319.8\n",
"sum 100708\n",
"Label 32\n",
" not over 789.0\n",
"sum 101496\n",
"Label 33\n",
" not over 60.599999999999994\n",
"sum 101513\n",
"Label 34\n",
" not over 474.0\n",
"sum 101975\n",
"Label 35\n",
" not over 1441.2\n"
]
}
],
"source": [
"grouped_NC = df.groupby(['Label', 'NC']).size().reset_index(name=\"Count\")\n",
"grouped_rel = df.groupby(['Label']).size().reset_index(name=\"Count_rel\")\n",
"\n",
"sum = 0\n",
"# iterate over relations to get amount of sentences from each relation\n",
"for index, row in grouped_rel.iterrows():\n",
" count = row['Count_rel']\n",
" amount = count*0.6\n",
" print(f\"sum {sum}\")\n",
" print(\"Label {}\".format(row['Label']))\n",
" print(f\" not over {amount}\")\n",
" # iterate over NC of relation to get sentences and dont split up NC in different sets\n",
" # amount of sentences should stay below amount calculated above\n",
" if sum < perc_train:\n",
" sum2 = 0\n",
"\n",
" # get only nc from relevant relation\n",
" NC_rel = grouped_NC.where(grouped_NC['Label']== row['Label']).dropna().reset_index(drop=True)\n",
" for index2, row2 in NC_rel.iterrows():\n",
" count2 = row2['Count']\n",
" temp = sum2 + count2\n",
" if temp < amount and sum+sum2 < perc_train:\n",
" df1 = df[df['NC'].str.contains(row2['NC'])]\n",
" train_set = pd.concat([train_set, df1], axis=0)\n",
" sum2 += count2\n",
" \n",
" sum = len(train_set)\n",
" train_set.to_csv(f'./train_data/train_set.csv') \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Relation Label NC \\\n",
"1173 ADJ-LIKE_NOUN 1 mass destruction \n",
"1174 ADJ-LIKE_NOUN 1 mass destruction \n",
"1175 ADJ-LIKE_NOUN 1 mass destruction \n",
"1176 ADJ-LIKE_NOUN 1 mass destruction \n",
"1177 ADJ-LIKE_NOUN 1 mass destruction \n",
"... ... ... ... \n",
"173965 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"173966 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"173967 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"173968 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"173969 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"\n",
" Sentence \n",
"1173 one used a primitive revolver; the other a wea... \n",
"1174 american obligations to come to israel’s defen... \n",
"1175 archbishop jose h. gomez of los angeles in a j... \n",
"1176 as robert draper recently us, those in the adm... \n",
"1177 didn’t powell say that iraq had ‘weapons of ma... \n",
"... ... \n",
"173965 snip out the bird’s backbone and add it to the... \n",
"173966 the arrest was made following tip-off received... \n",
"173967 the incident occurred when police, including g... \n",
"173968 apical meristem or growing tip. \n",
"173969 lesser is a smaller bird, with slimmer build, ... \n",
"\n",
"[70748 rows x 4 columns]>\n"
]
}
],
"source": [
"merge_set = pd.merge(df, train_set, how='outer', indicator=True)\n",
"reduced = merge_set.loc[merge_set._merge == 'left_only', ['Relation','Label','NC','Sentence']]\n",
"print(reduced.head)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sum 0\n",
"Label 1\n",
" not over 123.0\n",
"sum 116\n",
"Label 2\n",
" not over 110.55\n",
"sum 224\n",
"Label 3\n",
" not over 93.14999999999999\n",
"sum 317\n",
"Label 4\n",
" not over 463.79999999999995\n",
"sum 780\n",
"Label 5\n",
" not over 159.0\n",
"sum 937\n",
"Label 6\n",
" not over 343.8\n",
"sum 1280\n",
"Label 7\n",
" not over 613.35\n",
"sum 1893\n",
"Label 8\n",
" not over 29.849999999999998\n",
"sum 1919\n",
"Label 9\n",
" not over 11.25\n",
"sum 1919\n",
"Label 10\n",
" not over 120.14999999999999\n",
"sum 2036\n",
"Label 11\n",
" not over 595.9499999999999\n",
"sum 2631\n",
"Label 12\n",
" not over 172.95\n",
"sum 2803\n",
"Label 13\n",
" not over 603.75\n",
"sum 3405\n",
"Label 14\n",
" not over 73.2\n",
"sum 3477\n",
"Label 15\n",
" not over 1585.35\n",
"sum 5062\n",
"Label 16\n",
" not over 127.19999999999999\n",
"sum 5180\n",
"Label 17\n",
" not over 166.04999999999998\n",
"sum 5346\n",
"Label 18\n",
" not over 455.7\n",
"sum 5801\n",
"Label 19\n",
" not over 252.14999999999998\n",
"sum 6052\n",
"Label 20\n",
" not over 167.4\n",
"sum 6216\n",
"Label 21\n",
" not over 39.0\n",
"sum 6251\n",
"Label 22\n",
" not over 1356.1499999999999\n",
"sum 7607\n",
"Label 23\n",
" not over 252.14999999999998\n",
"sum 7859\n",
"Label 24\n",
" not over 662.85\n",
"sum 8521\n",
"Label 25\n",
" not over 376.34999999999997\n",
"sum 8897\n",
"Label 26\n",
" not over 196.04999999999998\n",
"sum 9093\n",
"Label 27\n",
" not over 291.75\n",
"sum 9382\n",
"Label 28\n",
" not over 82.35\n",
"sum 9460\n",
"Label 29\n",
" not over 710.1\n",
"sum 10170\n",
"Label 30\n",
" not over 30.0\n",
"sum 10170\n",
"Label 31\n",
" not over 32.55\n",
"sum 10200\n",
"Label 32\n",
" not over 79.05\n",
"sum 10279\n",
"Label 33\n",
" not over 12.6\n",
"sum 10279\n",
"Label 34\n",
" not over 49.199999999999996\n",
"sum 10279\n",
"Label 35\n",
" not over 174.45\n"
]
}
],
"source": [
"val_set = pd.DataFrame(columns = ['Relation', 'Label', 'NC', 'Sentence'])\n",
"grouped_NC = reduced.groupby(['Label', 'NC']).size().reset_index(name=\"Count\")\n",
"grouped_rel = reduced.groupby(['Label']).size().reset_index(name=\"Count_rel\")\n",
"\n",
"sum = 0\n",
"# iterate over relations to get amount of sentences from each relation\n",
"for index, row in grouped_rel.iterrows():\n",
" count = row['Count_rel']\n",
" amount = count*0.15\n",
" print(f\"sum {sum}\")\n",
" print(\"Label {}\".format(row['Label']))\n",
" print(f\" not over {amount}\")\n",
"\n",
" # iterate over NC of relation to get sentences and dont split up NC in different sets\n",
" # amount of sentences should stay below amount calculated above\n",
" if sum < perc_val:\n",
" sum2 = 0\n",
" # get only nc from relevant relation\n",
" NC_rel = grouped_NC.where(grouped_NC['Label']== row['Label']).dropna().reset_index(drop=True)\n",
" for index2, row2 in NC_rel.iterrows():\n",
" count2 = row2['Count']\n",
" temp = sum2 + count2\n",
" if temp < amount and sum+sum2 < perc_val:\n",
" df1 = reduced[reduced['NC'].str.contains(row2['NC'])]\n",
" val_set = pd.concat([val_set, df1], axis=0)\n",
" sum2 += count2\n",
" sum = len(val_set)\n",
" val_set.to_csv(f'./val_data/val_set.csv') "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<bound method NDFrame.head of Relation Label NC \\\n",
"116 ADJ-LIKE_NOUN 1 mass extinction \n",
"117 ADJ-LIKE_NOUN 1 mass extinction \n",
"118 ADJ-LIKE_NOUN 1 mass extinction \n",
"119 ADJ-LIKE_NOUN 1 mass extinction \n",
"120 ADJ-LIKE_NOUN 1 mass extinction \n",
"... ... ... ... \n",
"70743 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"70744 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"70745 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"70746 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"70747 WHOLE+PART_OR_MEMBER_OF 35 wing tip \n",
"\n",
" Sentence \n",
"116 earth is at the start of a sixth mass extincti... \n",
"117 the heisei era indicates that godzilla was a d... \n",
"118 the impact would have thrown trillions of tons... \n",
"119 across england and wales, towns and villages a... \n",
"120 geographically widespread organisms fare bette... \n",
"... ... \n",
"70743 snip out the bird’s backbone and add it to the... \n",
"70744 the arrest was made following tip-off received... \n",
"70745 the incident occurred when police, including g... \n",
"70746 apical meristem or growing tip. \n",
"70747 lesser is a smaller bird, with slimmer build, ... \n",
"\n",
"[60295 rows x 4 columns]>\n"
]
}
],
"source": [
"merge_set2 = pd.merge(reduced, val_set, how='outer', indicator=True)\n",
"test_set = merge_set2.loc[merge_set2._merge == 'left_only', ['Relation','Label','NC','Sentence']]\n",
"print(test_set.head)\n",
"\n",
"test_set.to_csv(f'./test_data/test_set.csv') "
]
}
],
"metadata": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
},
"kernelspec": {
"display_name": "Python 3.7.3 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
%% Cell type:code id: tags:
```
python
# Third-party and stdlib dependencies for the data-split notebook.
import random
import pandas as pd
```
%% Cell type:code id: tags:
```
python
# Load the annotated sentence corpus (one row per NC occurrence).
df = pd.read_csv("sentences_fine_200.csv")
```
%% Cell type:code id: tags:
```
python
# Target sentence counts per split: 60% train, 25% test, 15% validation.
sent_amount = len(df)
perc_train = sent_amount * 0.6
perc_test = sent_amount * 0.25
perc_val = sent_amount * 0.15
print(perc_train)
print(perc_val)
print(perc_test)
```
%% Output
104350.8
26087.7
43479.5
%% Cell type:code id: tags:
```
python
# Create an empty accumulator DataFrame for each split, all sharing the
# same corpus schema.
_SPLIT_COLUMNS = ['Relation', 'Label', 'NC', 'Sentence']
train_set = pd.DataFrame(columns=_SPLIT_COLUMNS)
val_set = pd.DataFrame(columns=_SPLIT_COLUMNS)
test_set = pd.DataFrame(columns=_SPLIT_COLUMNS)
```
%% Cell type:code id: tags:
```
python
# Build the training split: per relation, take whole NCs (a noun compound
# is never split across sets) until ~60% of that relation's sentences are
# collected, stopping globally once perc_train is reached.
grouped_NC = df.groupby(['Label', 'NC']).size().reset_index(name="Count")
grouped_rel = df.groupby(['Label']).size().reset_index(name="Count_rel")

# `total` tracks sentences collected so far (the original used the name
# `sum`, which shadows the builtin).
total = 0
# iterate over relations to get amount of sentences from each relation
for _, row in grouped_rel.iterrows():
    amount = row['Count_rel'] * 0.6  # per-relation cap (60%)
    print(f"sum {total}")
    print("Label {}".format(row['Label']))
    print(f" not over {amount}")
    # iterate over NCs of this relation; keep the per-relation and global
    # totals below their caps
    if total < perc_train:
        taken = 0  # sentences taken from the current relation
        # restrict to NCs of the current relation
        NC_rel = grouped_NC.where(grouped_NC['Label'] == row['Label']).dropna().reset_index(drop=True)
        for _, row2 in NC_rel.iterrows():
            count2 = row2['Count']
            if taken + count2 < amount and total + taken < perc_train:
                # Exact match on the NC. The original used
                # df['NC'].str.contains(...), which also grabs rows whose
                # NC merely contains this one (e.g. "tip" matching
                # "wing tip") and interprets the NC as a regex.
                df1 = df[df['NC'] == row2['NC']]
                train_set = pd.concat([train_set, df1], axis=0)
                taken += count2
        total = len(train_set)
# Write once after the loop (the original rewrote the file every
# iteration); plain string, no f-string needed.
train_set.to_csv('./train_data/train_set.csv')
```
%% Output
sum 0
Label 1
not over 1210.2
sum 1197
Label 2
not over 1104.6
sum 2301
Label 3
not over 930.5999999999999
sum 3231
Label 4
not over 4637.4
sum 7868
Label 5
not over 1587.0
sum 9453
Label 6
not over 3437.4
sum 12890
Label 7
not over 6597.599999999999
sum 19797
Label 8
not over 297.59999999999997
sum 20094
Label 9
not over 103.8
sum 20192
Label 10
not over 1199.3999999999999
sum 21390
Label 11
not over 5962.8
sum 27352
Label 12
not over 1728.0
sum 29079
Label 13
not over 6036.599999999999
sum 35115
Label 14
not over 731.4
sum 35846
Label 15
not over 15852.0
sum 51706
Label 16
not over 1271.3999999999999
sum 52977
Label 17
not over 1659.6
sum 54732
Label 18
not over 4556.4
sum 59292
Label 19
not over 2518.7999999999997
sum 61824
Label 20
not over 1667.3999999999999
sum 63489
Label 21
not over 385.2
sum 63871
Label 22
not over 13704.0
sum 77574
Label 23
not over 2521.2
sum 80098
Label 24
not over 6627.599999999999
sum 86737
Label 25
not over 3777.0
sum 90523
Label 26
not over 1959.6
sum 92483
Label 27
not over 2916.6
sum 95400
Label 28
not over 819.6
sum 96217
Label 29
not over 5308.8
sum 100331
Label 30
not over 156.6
sum 100392
Label 31
not over 319.8
sum 100708
Label 32
not over 789.0
sum 101496
Label 33
not over 60.599999999999994
sum 101513
Label 34
not over 474.0
sum 101975
Label 35
not over 1441.2
%% Cell type:code id: tags:
```
python
# `reduced` = rows of df that were NOT placed in the training set
# (outer merge with indicator, keep 'left_only').
merge_set = pd.merge(df, train_set, how='outer', indicator=True)
reduced = merge_set.loc[merge_set._merge == 'left_only', ['Relation', 'Label', 'NC', 'Sentence']]
# head() must be *called*: the original printed `reduced.head`, which
# shows the bound method object instead of the first rows.
print(reduced.head())
```
%% Output
<bound method NDFrame.head of Relation Label NC \
1173 ADJ-LIKE_NOUN 1 mass destruction
1174 ADJ-LIKE_NOUN 1 mass destruction
1175 ADJ-LIKE_NOUN 1 mass destruction
1176 ADJ-LIKE_NOUN 1 mass destruction
1177 ADJ-LIKE_NOUN 1 mass destruction
... ... ... ...
173965 WHOLE+PART_OR_MEMBER_OF 35 wing tip
173966 WHOLE+PART_OR_MEMBER_OF 35 wing tip
173967 WHOLE+PART_OR_MEMBER_OF 35 wing tip
173968 WHOLE+PART_OR_MEMBER_OF 35 wing tip
173969 WHOLE+PART_OR_MEMBER_OF 35 wing tip
Sentence
1173 one used a primitive revolver; the other a wea...
1174 american obligations to come to israel’s defen...
1175 archbishop jose h. gomez of los angeles in a j...
1176 as robert draper recently us, those in the adm...
1177 didn’t powell say that iraq had ‘weapons of ma...
... ...
173965 snip out the bird’s backbone and add it to the...
173966 the arrest was made following tip-off received...
173967 the incident occurred when police, including g...
173968 apical meristem or growing tip.
173969 lesser is a smaller bird, with slimmer build, ...
[70748 rows x 4 columns]>
%% Cell type:code id: tags:
```
python
# Build the validation split from the rows left over after the training
# split, using the same whole-NC scheme but capped at ~15% per relation.
val_set = pd.DataFrame(columns=['Relation', 'Label', 'NC', 'Sentence'])
grouped_NC = reduced.groupby(['Label', 'NC']).size().reset_index(name="Count")
grouped_rel = reduced.groupby(['Label']).size().reset_index(name="Count_rel")

# `total` tracks sentences collected so far (renamed from `sum`, which
# shadows the builtin).
total = 0
# iterate over relations to get amount of sentences from each relation
for _, row in grouped_rel.iterrows():
    amount = row['Count_rel'] * 0.15  # per-relation cap (15%)
    print(f"sum {total}")
    print("Label {}".format(row['Label']))
    print(f" not over {amount}")
    # iterate over NCs of this relation; keep the per-relation and global
    # totals below their caps
    if total < perc_val:
        taken = 0  # sentences taken from the current relation
        # restrict to NCs of the current relation
        NC_rel = grouped_NC.where(grouped_NC['Label'] == row['Label']).dropna().reset_index(drop=True)
        for _, row2 in NC_rel.iterrows():
            count2 = row2['Count']
            if taken + count2 < amount and total + taken < perc_val:
                # Exact NC match: str.contains() would also match
                # substrings and treat the NC text as a regex.
                df1 = reduced[reduced['NC'] == row2['NC']]
                val_set = pd.concat([val_set, df1], axis=0)
                taken += count2
        total = len(val_set)
# Write once after the loop; plain string, no f-string needed.
val_set.to_csv('./val_data/val_set.csv')
```
%% Output
sum 0
Label 1
not over 123.0
sum 116
Label 2
not over 110.55
sum 224
Label 3
not over 93.14999999999999
sum 317
Label 4
not over 463.79999999999995
sum 780
Label 5
not over 159.0
sum 937
Label 6
not over 343.8
sum 1280
Label 7
not over 613.35
sum 1893
Label 8
not over 29.849999999999998
sum 1919
Label 9
not over 11.25
sum 1919
Label 10
not over 120.14999999999999
sum 2036
Label 11
not over 595.9499999999999
sum 2631
Label 12
not over 172.95
sum 2803
Label 13
not over 603.75
sum 3405
Label 14
not over 73.2
sum 3477
Label 15
not over 1585.35
sum 5062
Label 16
not over 127.19999999999999
sum 5180
Label 17
not over 166.04999999999998
sum 5346
Label 18
not over 455.7
sum 5801
Label 19
not over 252.14999999999998
sum 6052
Label 20
not over 167.4
sum 6216
Label 21
not over 39.0
sum 6251
Label 22
not over 1356.1499999999999
sum 7607
Label 23
not over 252.14999999999998
sum 7859
Label 24
not over 662.85
sum 8521
Label 25
not over 376.34999999999997
sum 8897
Label 26
not over 196.04999999999998
sum 9093
Label 27
not over 291.75
sum 9382
Label 28
not over 82.35
sum 9460
Label 29
not over 710.1
sum 10170
Label 30
not over 30.0
sum 10170
Label 31
not over 32.55
sum 10200
Label 32
not over 79.05
sum 10279
Label 33
not over 12.6
sum 10279
Label 34
not over 49.199999999999996
sum 10279
Label 35
not over 174.45
%% Cell type:code id: tags:
```
python
# Everything not assigned to train or validation becomes the test split.
merge_set2 = pd.merge(reduced, val_set, how='outer', indicator=True)
test_set = merge_set2.loc[merge_set2._merge == 'left_only', ['Relation', 'Label', 'NC', 'Sentence']]
# head() must be *called* (the original printed the bound method).
print(test_set.head())

# Plain string path, no f-string needed.
test_set.to_csv('./test_data/test_set.csv')
```
%% Output
<bound method NDFrame.head of Relation Label NC \
116 ADJ-LIKE_NOUN 1 mass extinction
117 ADJ-LIKE_NOUN 1 mass extinction
118 ADJ-LIKE_NOUN 1 mass extinction
119 ADJ-LIKE_NOUN 1 mass extinction
120 ADJ-LIKE_NOUN 1 mass extinction
... ... ... ...
70743 WHOLE+PART_OR_MEMBER_OF 35 wing tip
70744 WHOLE+PART_OR_MEMBER_OF 35 wing tip
70745 WHOLE+PART_OR_MEMBER_OF 35 wing tip
70746 WHOLE+PART_OR_MEMBER_OF 35 wing tip
70747 WHOLE+PART_OR_MEMBER_OF 35 wing tip
Sentence
116 earth is at the start of a sixth mass extincti...
117 the heisei era indicates that godzilla was a d...
118 the impact would have thrown trillions of tons...
119 across england and wales, towns and villages a...
120 geographically widespread organisms fare bette...
... ...
70743 snip out the bird’s backbone and add it to the...
70744 the arrest was made following tip-off received...
70745 the incident occurred when police, including g...
70746 apical meristem or growing tip.
70747 lesser is a smaller bird, with slimmer build, ...
[60295 rows x 4 columns]>
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment