From 4f35e6520273a4b846b06a00e9b273e7a5eceb8f Mon Sep 17 00:00:00 2001 From: Nadia <nwarslan@cl.uni-heidelberg.de> Date: Wed, 27 Feb 2019 23:36:15 +0100 Subject: [PATCH] =?UTF-8?q?Kommentare=20hinzugef=C3=BCgt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../preprocessing/senseval/preprocess_senseval_method1.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/preprocessing/senseval/preprocess_senseval_method1.py b/scripts/preprocessing/senseval/preprocess_senseval_method1.py index 5ea478e..b672f1f 100644 --- a/scripts/preprocessing/senseval/preprocess_senseval_method1.py +++ b/scripts/preprocessing/senseval/preprocess_senseval_method1.py @@ -146,7 +146,9 @@ def get_lem_pos(word, d_no, sent_no, tb_docs): def preprocess(xml_file, tb_files): """ - + This function opens a senseval file and + returns a list containing a list for each document, each containing a list of sentences, + each sentence containing a dictionary for every word to be disambiguated. """ documents = read_xml(xml_file) answer_ids = documents[1] @@ -174,7 +176,7 @@ def preprocess(xml_file, tb_files): lem_pos = get_lem_pos(word,i,j,tb_docs) info['lemma'] = lem_pos[0] info['pos'] = lem_pos[1] - + # connects compound words with an underscore if ' sats=' in word: for l, word2 in enumerate(sent): id_m = id_p.search(word2).group() -- GitLab