diff --git a/scripts/preprocessing/senseval/preprocess_senseval_method1.py b/scripts/preprocessing/senseval/preprocess_senseval_method1.py index 5ea478ed04e37741e950f363dd23eadd2df989d0..b672f1f07a248a05db320f7234114eddc565c5e3 100644 --- a/scripts/preprocessing/senseval/preprocess_senseval_method1.py +++ b/scripts/preprocessing/senseval/preprocess_senseval_method1.py @@ -146,7 +146,9 @@ def get_lem_pos(word, d_no, sent_no, tb_docs): def preprocess(xml_file, tb_files): """ - + this function opens a senseval file + returns a list containing a list for each document, each containing a list of sentences + containing a dictionary for every word to be disambiguated """ documents = read_xml(xml_file) answer_ids = documents[1] @@ -174,7 +176,7 @@ def preprocess(xml_file, tb_files): lem_pos = get_lem_pos(word,i,j,tb_docs) info['lemma'] = lem_pos[0] info['pos'] = lem_pos[1] - + # connects compound words per underscore if ' sats=' in word: for l, word2 in enumerate(sent): id_m = id_p.search(word2).group()