Commit 2446fb61 authored by engel

Add header to readme

parent 89e1f55e
# Data
## Table of Contents
- [Data](#data)
- [Table of Contents](#table-of-contents)
- [Searching for data](#searching-for-data)
- [Forcing BERT's Attention on the compound](#forcing-berts-attention-on-the-compound)
- [Limiting NC Occurrences](#limiting-nc-occurrences)
......
%% Cell type:code id: tags:
``` python
import random
import pandas as pd
```
%% Cell type:code id: tags:
``` python
df = pd.read_csv("sentences_fine_200.csv")
```
%% Cell type:code id: tags:
``` python
# Target number of sentences per split: 60 % train, 25 % test, 15 % validation
sent_amount = len(df)
perc_train = sent_amount*0.6
perc_test = sent_amount*0.25
perc_val = sent_amount*0.15
print(perc_train)
print(perc_val)
print(perc_test)
```
%% Output
104350.8
26087.7
43479.5
%% Cell type:code id: tags:
``` python
# Create different DataFrames for each set
train_set = pd.DataFrame(columns = ['Relation', 'Label', 'NC', 'Sentence'])
val_set = pd.DataFrame(columns = ['Relation', 'Label', 'NC', 'Sentence'])
test_set = pd.DataFrame(columns = ['Relation', 'Label', 'NC', 'Sentence'])
```
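The cells below grow these frames by repeatedly `pd.concat`-ing slices into an initially empty DataFrame, which is quadratic and triggers deprecation warnings about empty-frame concatenation in recent pandas versions. A common alternative, sketched here with hypothetical toy data in place of the real per-NC slices, is to collect the pieces in a list and concatenate once:

``` python
import pandas as pd

# Hypothetical pieces standing in for the per-NC sentence slices
pieces = [
    pd.DataFrame({"NC": ["mass destruction"], "Sentence": ["s1"]}),
    pd.DataFrame({"NC": ["wing tip"], "Sentence": ["s2"]}),
]

# One concat at the end instead of one per slice
train_set = pd.concat(pieces, axis=0, ignore_index=True)
```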
%% Cell type:code id: tags:
``` python
grouped_NC = df.groupby(['Label', 'NC']).size().reset_index(name="Count")
grouped_rel = df.groupby(['Label']).size().reset_index(name="Count_rel")
sum = 0
# Iterate over the relations to get the number of sentences for each relation
for index, row in grouped_rel.iterrows():
    count = row['Count_rel']
    amount = count*0.6
    print(f"sum {sum}")
    print("Label {}".format(row['Label']))
    print(f" not over {amount}")
    # Iterate over the NCs of this relation to collect sentences without splitting
    # an NC across sets; the number of sentences should stay below `amount`
    if sum < perc_train:
        sum2 = 0
        # Keep only the NCs of the current relation
        NC_rel = grouped_NC.where(grouped_NC['Label'] == row['Label']).dropna().reset_index(drop=True)
        for index2, row2 in NC_rel.iterrows():
            count2 = row2['Count']
            temp = sum2 + count2
            if temp < amount and sum + sum2 < perc_train:
                # Note: str.contains does substring matching, so sentences of a
                # longer compound (e.g. "wing tip") also match a shorter NC ("tip")
                df1 = df[df['NC'].str.contains(row2['NC'])]
                train_set = pd.concat([train_set, df1], axis=0)
                sum2 += count2
        sum = len(train_set)
train_set.to_csv('./train_data/train_set.csv')
```
%% Output
sum 0
Label 1
not over 1210.2
sum 1197
Label 2
not over 1104.6
sum 2301
Label 3
not over 930.5999999999999
sum 3231
Label 4
not over 4637.4
sum 7868
Label 5
not over 1587.0
sum 9453
Label 6
not over 3437.4
sum 12890
Label 7
not over 6597.599999999999
sum 19797
Label 8
not over 297.59999999999997
sum 20094
Label 9
not over 103.8
sum 20192
Label 10
not over 1199.3999999999999
sum 21390
Label 11
not over 5962.8
sum 27352
Label 12
not over 1728.0
sum 29079
Label 13
not over 6036.599999999999
sum 35115
Label 14
not over 731.4
sum 35846
Label 15
not over 15852.0
sum 51706
Label 16
not over 1271.3999999999999
sum 52977
Label 17
not over 1659.6
sum 54732
Label 18
not over 4556.4
sum 59292
Label 19
not over 2518.7999999999997
sum 61824
Label 20
not over 1667.3999999999999
sum 63489
Label 21
not over 385.2
sum 63871
Label 22
not over 13704.0
sum 77574
Label 23
not over 2521.2
sum 80098
Label 24
not over 6627.599999999999
sum 86737
Label 25
not over 3777.0
sum 90523
Label 26
not over 1959.6
sum 92483
Label 27
not over 2916.6
sum 95400
Label 28
not over 819.6
sum 96217
Label 29
not over 5308.8
sum 100331
Label 30
not over 156.6
sum 100392
Label 31
not over 319.8
sum 100708
Label 32
not over 789.0
sum 101496
Label 33
not over 60.599999999999994
sum 101513
Label 34
not over 474.0
sum 101975
Label 35
not over 1441.2
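The key idea of the loop above is that whole compounds, never individual sentences, are assigned to a split. That greedy assignment can be sketched more compactly; the toy frame and the 60 % target below are hypothetical, and exact `isin` matching is used instead of `str.contains`, which avoids substring hits (a compound "tip" also matching "wing tip" sentences):

``` python
import pandas as pd

# Hypothetical toy frame standing in for the real sentence data
df = pd.DataFrame({
    "NC": ["mass destruction"] * 3 + ["wing tip"] * 3 + ["mass extinction"] * 2,
    "Sentence": [f"s{i}" for i in range(8)],
})

target = len(df) * 0.6            # desired train size in sentences
counts = df["NC"].value_counts()  # sentences per compound

train_ncs, taken = [], 0
for nc, n in counts.items():
    if taken + n <= target:       # a whole compound goes in, or not at all
        train_ncs.append(nc)
        taken += n

train = df[df["NC"].isin(train_ncs)]   # exact match, no substring hits
rest = df[~df["NC"].isin(train_ncs)]

# No compound appears in both sets
assert set(train["NC"]).isdisjoint(set(rest["NC"]))
```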
%% Cell type:code id: tags:
``` python
# Anti-join: keep only the rows of df that were not taken into the train set
merge_set = pd.merge(df, train_set, how='outer', indicator=True)
reduced = merge_set.loc[merge_set['_merge'] == 'left_only', ['Relation','Label','NC','Sentence']]
print(reduced.head())
```
%% Output
<bound method NDFrame.head of Relation Label NC \
1173 ADJ-LIKE_NOUN 1 mass destruction
1174 ADJ-LIKE_NOUN 1 mass destruction
1175 ADJ-LIKE_NOUN 1 mass destruction
1176 ADJ-LIKE_NOUN 1 mass destruction
1177 ADJ-LIKE_NOUN 1 mass destruction
... ... ... ...
173965 WHOLE+PART_OR_MEMBER_OF 35 wing tip
173966 WHOLE+PART_OR_MEMBER_OF 35 wing tip
173967 WHOLE+PART_OR_MEMBER_OF 35 wing tip
173968 WHOLE+PART_OR_MEMBER_OF 35 wing tip
173969 WHOLE+PART_OR_MEMBER_OF 35 wing tip
Sentence
1173 one used a primitive revolver; the other a wea...
1174 american obligations to come to israel’s defen...
1175 archbishop jose h. gomez of los angeles in a j...
1176 as robert draper recently us, those in the adm...
1177 didn’t powell say that iraq had ‘weapons of ma...
... ...
173965 snip out the bird’s backbone and add it to the...
173966 the arrest was made following tip-off received...
173967 the incident occurred when police, including g...
173968 apical meristem or growing tip.
173969 lesser is a smaller bird, with slimmer build, ...
[70748 rows x 4 columns]>
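The `indicator=True` anti-join used above is worth isolating: an outer merge adds a `_merge` column whose value is `'left_only'` for rows present only in the left frame. A minimal sketch with hypothetical toy frames:

``` python
import pandas as pd

# Hypothetical frames: `full` is all sentences, `train` the rows already taken
full = pd.DataFrame({"NC": ["a", "a", "b", "c"],
                     "Sentence": ["s1", "s2", "s3", "s4"]})
train = pd.DataFrame({"NC": ["a", "a"], "Sentence": ["s1", "s2"]})

# Outer merge on all shared columns; _merge marks each row's origin
merged = full.merge(train, how="outer", indicator=True)

# 'left_only' rows exist only in `full`, i.e. the remainder
reduced = merged.loc[merged["_merge"] == "left_only", ["NC", "Sentence"]]
```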
%% Cell type:code id: tags:
``` python
val_set = pd.DataFrame(columns = ['Relation', 'Label', 'NC', 'Sentence'])
grouped_NC = reduced.groupby(['Label', 'NC']).size().reset_index(name="Count")
grouped_rel = reduced.groupby(['Label']).size().reset_index(name="Count_rel")
sum = 0
# Iterate over the relations to get the number of sentences for each relation
for index, row in grouped_rel.iterrows():
    count = row['Count_rel']
    amount = count*0.15
    print(f"sum {sum}")
    print("Label {}".format(row['Label']))
    print(f" not over {amount}")
    # Iterate over the NCs of this relation to collect sentences without splitting
    # an NC across sets; the number of sentences should stay below `amount`
    if sum < perc_val:
        sum2 = 0
        # Keep only the NCs of the current relation
        NC_rel = grouped_NC.where(grouped_NC['Label'] == row['Label']).dropna().reset_index(drop=True)
        for index2, row2 in NC_rel.iterrows():
            count2 = row2['Count']
            temp = sum2 + count2
            if temp < amount and sum + sum2 < perc_val:
                df1 = reduced[reduced['NC'].str.contains(row2['NC'])]
                val_set = pd.concat([val_set, df1], axis=0)
                sum2 += count2
        sum = len(val_set)
val_set.to_csv('./val_data/val_set.csv')
```
%% Output
sum 0
Label 1
not over 123.0
sum 116
Label 2
not over 110.55
sum 224
Label 3
not over 93.14999999999999
sum 317
Label 4
not over 463.79999999999995
sum 780
Label 5
not over 159.0
sum 937
Label 6
not over 343.8
sum 1280
Label 7
not over 613.35
sum 1893
Label 8
not over 29.849999999999998
sum 1919
Label 9
not over 11.25
sum 1919
Label 10
not over 120.14999999999999
sum 2036
Label 11
not over 595.9499999999999
sum 2631
Label 12
not over 172.95
sum 2803
Label 13
not over 603.75
sum 3405
Label 14
not over 73.2
sum 3477
Label 15
not over 1585.35
sum 5062
Label 16
not over 127.19999999999999
sum 5180
Label 17
not over 166.04999999999998
sum 5346
Label 18
not over 455.7
sum 5801
Label 19
not over 252.14999999999998
sum 6052
Label 20
not over 167.4
sum 6216
Label 21
not over 39.0
sum 6251
Label 22
not over 1356.1499999999999
sum 7607
Label 23
not over 252.14999999999998
sum 7859
Label 24
not over 662.85
sum 8521
Label 25
not over 376.34999999999997
sum 8897
Label 26
not over 196.04999999999998
sum 9093
Label 27
not over 291.75
sum 9382
Label 28
not over 82.35
sum 9460
Label 29
not over 710.1
sum 10170
Label 30
not over 30.0
sum 10170
Label 31
not over 32.55
sum 10200
Label 32
not over 79.05
sum 10279
Label 33
not over 12.6
sum 10279
Label 34
not over 49.199999999999996
sum 10279
Label 35
not over 174.45
%% Cell type:code id: tags:
``` python
# Everything not used for train or val becomes the test set
merge_set2 = pd.merge(reduced, val_set, how='outer', indicator=True)
test_set = merge_set2.loc[merge_set2['_merge'] == 'left_only', ['Relation','Label','NC','Sentence']]
print(test_set.head())
test_set.to_csv('./test_data/test_set.csv')
```
%% Output
<bound method NDFrame.head of Relation Label NC \
116 ADJ-LIKE_NOUN 1 mass extinction
117 ADJ-LIKE_NOUN 1 mass extinction
118 ADJ-LIKE_NOUN 1 mass extinction
119 ADJ-LIKE_NOUN 1 mass extinction
120 ADJ-LIKE_NOUN 1 mass extinction
... ... ... ...
70743 WHOLE+PART_OR_MEMBER_OF 35 wing tip
70744 WHOLE+PART_OR_MEMBER_OF 35 wing tip
70745 WHOLE+PART_OR_MEMBER_OF 35 wing tip
70746 WHOLE+PART_OR_MEMBER_OF 35 wing tip
70747 WHOLE+PART_OR_MEMBER_OF 35 wing tip
Sentence
116 earth is at the start of a sixth mass extincti...
117 the heisei era indicates that godzilla was a d...
118 the impact would have thrown trillions of tons...
119 across england and wales, towns and villages a...
120 geographically widespread organisms fare bette...
... ...
70743 snip out the bird’s backbone and add it to the...
70744 the arrest was made following tip-off received...
70745 the incident occurred when police, including g...
70746 apical meristem or growing tip.
70747 lesser is a smaller bird, with slimmer build, ...
[60295 rows x 4 columns]>
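Once the three CSVs are written, a quick sanity check that no compound leaked across splits might look like the following sketch; the toy frames are hypothetical stand-ins for reading the written files back:

``` python
import pandas as pd

# Hypothetical stand-ins for the train/val/test frames written above
train = pd.DataFrame({"NC": ["a", "a"], "Sentence": ["s1", "s2"]})
val = pd.DataFrame({"NC": ["b"], "Sentence": ["s3"]})
test = pd.DataFrame({"NC": ["c", "c"], "Sentence": ["s4", "s5"]})

splits = {"train": train, "val": val, "test": test}
ncs = {name: set(s["NC"]) for name, s in splits.items()}

# Every pair of splits should share no compound
names = list(ncs)
for i, a in enumerate(names):
    for b in names[i + 1:]:
        overlap = ncs[a] & ncs[b]
        assert not overlap, f"{a}/{b} share NCs: {overlap}"
```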