"""Command-line interface for the graph-embedding, node-classification and
Word Sense Disambiguation (WSD) experiments.

Each sub-command imports its heavy dependencies lazily so that `--help`
stays fast and does not require the training stack to be installed.
"""
import click
import sys
import os


@click.group()
@click.pass_context
def main(ctx):
    """Top-level command group; all experiment commands are registered on it."""
    pass


@main.command()
@click.option(
    '--input', '-i', type=str, help='The path to the pickled networkx graph.', default='data/wordnet/graphs/wn_graph.pkl', show_default=True,
)
@click.option(
    '--output', '-o', type=str, help='The output directory.', default='data/wordnet/', show_default=True,
)
@click.option(
    '--epochs', '-n', type=int, help='The number of epochs.', default=200, show_default=True,
)
@click.option(
    '--margin', '-m', type=int, help='The margin of the ranking loss function.', default=20, show_default=True,
)
@click.option(
    '--batch_size', '-b', type=int, help='The minibatch size.', default=128, show_default=True,
)
@click.option(
    '--emb_size', '-e', type=int, help='The size of the learnt embeddings', default=128, show_default=True,
)
@click.option(
    '--rate', '-t', type=float, help='The starting learning rate of the Adam Optimizer.', default=0.001, show_default=True,
)
@click.option(
    # Tuple default: `multiple=True` options yield tuples, and a mutable
    # list default would be shared across invocations.
    '--skip', '-s', multiple=True, default=(), show_default=True, help='Label types to skip during training.',
)
@click.pass_context
def embedding_propagation(ctx, batch_size, epochs, margin, emb_size, rate, input, output, skip):
    """
    Trains the EP-SP algorithm (reimplementation of "Learning graph
    representations with embedding propagation", Duran and Niepert, 2017)
    with the given input graph and hyperparameters.
    """
    from scripts.embedding_propagation import epsp

    # makedirs + exist_ok: creates missing parent directories too and is
    # race-free, unlike the exists()/mkdir() pair.
    os.makedirs(output, exist_ok=True)

    ep = epsp.EmbeddingPropagation(margin=margin, epochs=epochs, batch_size=batch_size, learning_rate=rate,
                                   embedding_size=emb_size, filename=input, output_dir=output, skip=skip)
    ep.train()


@main.command()
@click.option(
    '--graph', '-g', type=str, help='The path to the Cora graph, output by the cli.py cora command.', default='data/cora/graphs/cora_graph.pkl', show_default=True,
)
@click.option(
    '--embeddings', '-e', type=str, help='The path to the concatenated Cora node embeddings.', default='data/cora/embeddings/merged_node_embeddings.pkl', show_default=True,
)
@click.option(
    '--seed', '-s', type=int, help='The random seed for the experiment, != 0.', default=0, show_default=True,
)
@click.option(
    '--iterations', '-i', type=int, help='The number of runs. Each run has a different random seed. Ignored if the -s option is used.', default=10, show_default=True,
)
@click.option(
    '--num_instances', '-n', type=int, help='The number of instances per class for training.', default=20, show_default=True,
)
@click.option(
    '--num_test', '-t', type=int, help='The number of random test instances.', default=1000, show_default=True,
)
@click.option(
    '--regularization', '-c', type=float, help='Inverse of regularization strength.', default=0.1, show_default=True,
)
@click.pass_context
def node_classification(ctx, graph, embeddings, seed, iterations, num_instances, num_test, regularization):
    """
    Runs the node classification experiment described in Section 2.3.
    """
    from scripts.node_classification import nc_experiment as nc

    # seed == 0 (the default) means "no fixed seed": run `iterations`
    # repetitions with random seeds instead of one seeded run.
    if seed:
        nc.node_classification(path_graph=graph, path_embeddings=embeddings, seed=seed, num_per_class=num_instances, C=regularization)
    else:
        nc.node_classification_random_seeds(path_graph=graph, path_embeddings=embeddings, num_test_instances=num_test,
                                            num_per_class=num_instances, iterations=iterations, C=regularization)


@main.command()
@click.option(
    '--input', '-i', type=str, help='The path to the raw Cora files', default='data/cora/raw/', show_default=True,
)
@click.option(
    '--output', '-o', type=str, help='The output path + filename for the pickled graph', default='data/cora/graphs/graph.pkl', show_default=True,
)
@click.pass_context
def process_cora(ctx, input, output):
    """
    Creates and pickles the Cora graph.
    """
    from scripts.preprocessing.cora import cora

    # os.path.join avoids doubled separators when `input` ends with '/'.
    path_nodes = os.path.join(input, 'cora.content')
    path_edges = os.path.join(input, 'cora.cites')

    cora.write_pickle_graph_file(path_nodes=path_nodes, path_edges=path_edges, output_path=output)


@main.command()
@click.option(
    '--senseval', '-s', type=click.Choice(['2', '3']), help='Whether to run the experiment on SensEval 2 or 3.', default='3', show_default=True,
)
@click.option(
    '--embeddings', '-e', type=str, help='Embeddings file to load.', default='data/wordnet/embeddings/epoch_500/wsd_node_embeddings.pkl', show_default=True,
)
@click.option(
    '--id_map', '-i', type=str, help='The path to the .json id mapping.', default='data/wordnet/mappings/json/id_mapping.json', show_default=True,
)
@click.option(
    '--lemma_map', '-l', type=str, help='The path to the .txt lemma mapping.', default='data/wordnet/mappings/txt/lemmata_mapping2.txt', show_default=True,
)
@click.option(
    '--sense_key', '-k', type=str, help='The path to the wn30->wn17 mapping.', default='data/wordnet/mappings/txt/wn30_wn17_long-wn17_pos.txt', show_default=True,
)
@click.option(
    '--output', '-o', type=str, help='Name of the answer file. No path please.', default='wsd1', show_default=True,
)
@click.pass_context
def wsd_1(ctx, senseval, embeddings, id_map, lemma_map, sense_key, output):
    """
    Runs Method 1 of the Word Sense Disambiguation experiment, as described in Section 3.3.2.
    """
    from scripts.wsd import wsd_method1 as wsd

    senseval_path = 'data/senseval{}/'.format(senseval)
    input_file = os.path.join(senseval_path, 'processed/ambig_sents.json')

    # Refuse to overwrite a previous answer file.
    output_file = '{}wsd_answers/{}'.format(senseval_path, output)
    if os.path.exists(output_file):
        print('File {} already exists. Please delete old file or give another filename (-o).'.format(output_file))
        sys.exit(1)

    embeddings = wsd.open_embedding_file(embeddings)
    id_mapping = wsd.open_mapping(id_map)
    lemmata_mapping = wsd.open_lem_map(lemma_map)
    sense_key_mapping = wsd.open_sense_keys(sense_key)

    solutions = wsd.iterate_over(input_file, embeddings, lemmata_mapping, id_mapping, sense_key_mapping)
    wsd.write_answer_to_file(solutions, output_file)

    print('WSD answers saved under', output_file)


@main.command()
@click.option(
    '--context', '-c', type=int, help='The size of the context window.', default=5, show_default=True,
)
@click.option(
    '--senseval', '-s', type=click.Choice(['2', '3']), help='Whether to run the experiment on SensEval 2 or 3.', default='3', show_default=True,
)
@click.option(
    '--output', '-o', type=str, help='Name of the answer file. No path please.', default='wsd2', show_default=True,
)
@click.option(
    '--mappings', '-m', type=str, help='The path to the pickled WordNet mappings.', default='data/wordnet/mappings/pickled/', show_default=True,
)
@click.option(
    '--embeddings', '-e', type=str, help='The path to the pickled WordNet node embeddings.', default='data/wordnet/embeddings/epoch_500/', show_default=True,
)
@click.option(
    '--first_sense', '-fs', is_flag=True, help='Whether the "First WordNet Sense" baseline should be used.',
)
@click.option(
    '--info', '-i', is_flag=True, help='Whether to print some info on the console.',
)
@click.pass_context
def wsd_2(ctx, context, output, senseval, mappings, embeddings, first_sense, info):
    """
    Runs Method 2 of the Word Sense Disambiguation experiment, as described in Section 3.3.2.
    """
    from scripts.wsd import wsd_method2 as wsd, wsd2_exp as exp

    senseval_path = 'data/senseval{}/'.format(senseval)

    # Refuse to overwrite a previous answer file.
    output_file = '{}wsd_answers/{}_{}'.format(senseval_path, output, context)
    if os.path.exists(output_file):
        print('File {} already exists. Please delete old file or give another filename (-o).'.format(output_file))
        sys.exit(1)

    # Load every pickled, preprocessed SensEval document.
    input_path = os.path.join(senseval_path, 'processed')
    docs = []
    for file in sorted(os.listdir(input_path)):
        if not file.endswith('.pkl'):
            continue
        print('Processing {}...'.format(file))
        docs.append(wsd.read_pkl(os.path.join(input_path, file)))

    # Get all required mappings. os.path.join keeps the paths clean
    # whether or not `mappings` ends with a separator.
    id_map = wsd.read_pkl(os.path.join(mappings, 'id_mapping.pkl'))
    reverse_id_map = wsd.read_pkl(os.path.join(mappings, 'reverse_id.pkl'))
    gloss_map = wsd.read_pkl(os.path.join(mappings, 'gloss_mapping.pkl'))
    lemma_map = wsd.read_pkl(os.path.join(mappings, 'lemma_mapping.pkl'))
    pos_map = wsd.read_pkl(os.path.join(mappings, 'pos_mapping.pkl'))
    wordnet_map = wsd.read_pkl(os.path.join(mappings, 'pickled_wn_mapping.pkl'))

    # Get required embeddings.
    gloss_emb = wsd.read_pkl(os.path.join(embeddings, 'embedding_matrix_gloss'))  # dummy gloss embeddings
    lemma_emb = wsd.read_pkl(os.path.join(embeddings, 'embedding_matrix_lemma'))
    pos_emb = wsd.read_pkl(os.path.join(embeddings, 'embedding_matrix_pos'))
    node_emb = wsd.read_pkl(os.path.join(embeddings, 'wsd_node_embeddings.pkl'))

    disambiguator = wsd.WSD(id_map, reverse_id_map, gloss_map, gloss_emb, lemma_map,
                            lemma_emb, pos_map, pos_emb, node_emb, wordnet_map)

    for doc in docs:
        exp.run_experiment(window_size=context, document=doc, disambiguator=disambiguator, output_file=output_file, mfs=first_sense, info=info)

    print('WSD answers saved under', output_file)


@main.command()
@click.option(
    '--answers', '-a', type=str, help='Name of the answer file. No path please.', default='wsd1', show_default=True,
)
@click.option(
    '--senseval', '-s', type=click.Choice(['2', '3']), help='Whether to use the SensEval 2 or 3 answer key.', default='3', show_default=True,
)
@click.pass_context
def score_wsd(ctx, answers, senseval):
    """
    Calls the official SensEval Scorer on some system output.
    """
    import subprocess

    senseval_path = 'data/senseval{}/'.format(senseval)
    answers = senseval_path + 'wsd_answers/' + answers

    # BUG FIX: click.Choice(['2', '3']) yields a *string*, so the original
    # comparison `senseval == 2` was always False and SensEval 2 output was
    # silently scored against the SensEval 3 key.
    if senseval == '2':
        key = senseval_path + 'raw/key_senseval2'
    else:
        key = senseval_path + 'raw/EnglishAW.test.key'

    # Argument list (shell=False) — no shell string interpolation.
    command = ['scripts/scoring/scorer2', answers, key]
    subprocess.check_call(command)


if __name__ == "__main__":
    main()