Skip to content
Snippets Groups Projects
Commit cacea6a9 authored by dimitrova's avatar dimitrova
Browse files

Added a CLI for the new project structure.

parent b40d1a06
No related branches found
No related tags found
No related merge requests found
import click
import sys
import os
@click.group()
@click.pass_context
def main(ctx):
    """Entry point of the command line interface; subcommands attach to this group."""
@main.command()
@click.option(
    '--input', '-i', type=str, help='The path to the pickled networkx graph.', default='data/wordnet/graphs/wn_graph.pkl', show_default=True,
)
@click.option(
    '--output', '-o', type=str, help='The output directory.', default='data/wordnet/', show_default=True,
)
@click.option(
    # Defaults are now real ints/floats matching the declared type, instead of
    # strings that click had to coerce (e.g. default='200' with type=int).
    '--epochs', '-n', type=int, help='The number of epochs.', default=200, show_default=True,
)
@click.option(
    '--margin', '-m', type=int, help='The margin of the ranking loss function.', default=20, show_default=True,
)
@click.option(
    '--batch_size', '-b', type=int, help='The minibatch size.', default=128, show_default=True,
)
@click.option(
    '--emb_size', '-e', type=int, help='The size of the learnt embeddings', default=128, show_default=True,
)
@click.option(
    '--rate', '-t', type=float, help='The starting learning rate of the Adam Optimizer.', default=0.001, show_default=True,
)
@click.option(
    # An immutable tuple avoids the shared-mutable-default pitfall.
    '--skip', '-s', multiple=True, default=(), show_default=True, help='Label types to skip during training.',
)
@click.pass_context
def embedding_propagation(ctx, batch_size, epochs, margin, emb_size, rate, input, output, skip):
    """
    Trains the EP-SP algorithm (reimplementation of "Learning graph representations with embedding propagation", Duran and Niepert, 2017)
    with the given input graph and hyperparameters.
    """
    # Deferred import so that unrelated subcommands start quickly.
    from scripts.embedding_propagation import epsp

    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not crash if the directory already exists (os.mkdir did both).
    os.makedirs(output, exist_ok=True)
    ep = epsp.EmbeddingPropagation(margin=margin, epochs=epochs, batch_size=batch_size, learning_rate=rate,
                                   embedding_size=emb_size, filename=input, output_dir=output, skip=skip)
    ep.train()
@main.command()
@click.option(
    '--graph', '-g', type=str, help='The path to the Cora graph, output by the cli.py cora command.', default='data/cora/graphs/cora_graph.pkl', show_default=True,
)
@click.option(
    '--embeddings', '-e', type=str, help='The path to the concatenated Cora node embeddings.', default='data/cora/embeddings/merged_node_embeddings.pkl', show_default=True,
)
@click.option(
    '--seed', '-s', type=int, help='The random seed for the experiment, != 0.', default=0, show_default=True,
)
@click.option(
    # Defaults are now real ints/floats matching the declared type, instead of
    # strings that click had to coerce (e.g. default='10' with type=int).
    '--iterations', '-i', type=int, help='The number of runs. Each run has a different random seed. Ignored if the -s option is used.', default=10, show_default=True,
)
@click.option(
    '--num_instances', '-n', type=int, help='The number of instances per class for training.', default=20, show_default=True,
)
@click.option(
    '--num_test', '-t', type=int, help='The number of random test instances.', default=1000, show_default=True,
)
@click.option(
    '--regularization', '-c', type=float, help='Inverse of regularization strength.', default=0.1, show_default=True,
)
@click.pass_context
def node_classification(ctx, graph, embeddings, seed, iterations, num_instances, num_test, regularization):
    """
    Runs the node classification experiment described in Section 2.3.
    """
    # Deferred import so that unrelated subcommands start quickly.
    from scripts.node_classification import nc_experiment as nc

    # seed == 0 (the default) means "no fixed seed": run the multi-iteration
    # experiment with random seeds instead of a single seeded run.
    if seed:
        nc.node_classification(path_graph=graph, path_embeddings=embeddings, seed=seed, num_per_class=num_instances, C=regularization)
    else:
        nc.node_classification_random_seeds(path_graph=graph, path_embeddings=embeddings, num_test_instances=num_test,
                                            num_per_class=num_instances, iterations=iterations, C=regularization)
@main.command()
@click.option(
    '--input', '-i', type=str, help='The path to the raw Cora files', default='data/cora/raw/', show_default=True,
)
@click.option(
    '--output', '-o', type=str, help='The output path + filename for the pickled graph', default='data/cora/graphs/graph.pkl', show_default=True,
)
@click.pass_context
def process_cora(ctx, input, output):
    """
    Creates and pickles the Cora graph.
    """
    # Deferred import so that unrelated subcommands start quickly.
    from scripts.preprocessing.cora import cora

    # os.path.join avoids the doubled separator that '{}/cora.content'.format()
    # produced with the trailing-slash default for --input.
    path_nodes = os.path.join(input, 'cora.content')
    path_edges = os.path.join(input, 'cora.cites')
    cora.write_pickle_graph_file(path_nodes=path_nodes, path_edges=path_edges, output_path=output)
@main.command()
@click.option(
    '--senseval', '-s', type=click.Choice(['2', '3']), help='Whether to run the experiment on SensEval 2 or 3.', default='3', show_default=True,
)
@click.option(
    '--embeddings', '-e', type=str, help='Embeddings file to load.', default='data/wordnet/embeddings/epoch_500/wsd_node_embeddings.pkl', show_default=True,
)
@click.option(
    '--id_map', '-i', type=str, help='The path to the .json id mapping.', default='data/wordnet/mappings/json/id_mapping.json', show_default=True,
)
@click.option(
    '--lemma_map', '-l', type=str, help='The path to the .txt lemma mapping.', default='data/wordnet/mappings/txt/lemmata_mapping2.txt', show_default=True,
)
@click.option(
    '--sense_key', '-k', type=str, help='The path to the wn30->wn17 mapping.', default='data/wordnet/mappings/txt/wn30_wn17_long-wn17_pos.txt', show_default=True,
)
@click.option(
    '--output', '-o', type=str, help='Name of the answer file. No path please.', default='wsd1', show_default=True,
)
@click.pass_context
def wsd_1(ctx, senseval, embeddings, id_map, lemma_map, sense_key, output):
    """
    Runs Method 1 of the Word Sense Disambiguation experiment, as described in Section 3.3.2.
    """
    # Deferred import so that unrelated subcommands start quickly.
    from scripts.wsd import wsd_method1 as wsd

    senseval_path = 'data/senseval{}/'.format(senseval)
    input_file = os.path.join(senseval_path, 'processed/ambig_sents.json')
    output_file = '{}wsd_answers/{}'.format(senseval_path, output)

    # Refuse to clobber an existing answer file.
    if os.path.exists(output_file):
        print('File {} already exists. Please delete old file or give another filename (-o).'.format(output_file))
        sys.exit(1)

    # Load embeddings and the three mappings needed to resolve senses.
    node_embeddings = wsd.open_embedding_file(embeddings)
    id_mapping = wsd.open_mapping(id_map)
    lemmata_mapping = wsd.open_lem_map(lemma_map)
    sense_key_mapping = wsd.open_sense_keys(sense_key)

    # Disambiguate every ambiguous sentence and persist the answers.
    solutions = wsd.iterate_over(input_file, node_embeddings, lemmata_mapping, id_mapping, sense_key_mapping)
    wsd.write_answer_to_file(solutions, output_file)
    print('WSD answers saved under', output_file)
@main.command()
@click.option(
    '--context', '-c', type=int, help='The size of the context window.', default=5, show_default=True,
)
@click.option(
    '--senseval', '-s', type=click.Choice(['2', '3']), help='Whether to run the experiment on SensEval 2 or 3.', default='3', show_default=True,
)
@click.option(
    '--output', '-o', type=str, help='Name of the answer file. No path please.', default='wsd2', show_default=True,
)
@click.option(
    '--mappings', '-m', type=str, help='The path to the pickled WordNet mappings.', default='data/wordnet/mappings/pickled/', show_default=True,
)
@click.option(
    '--embeddings', '-e', type=str, help='The path to the pickled WordNet node embeddings.', default='data/wordnet/embeddings/epoch_500/', show_default=True,
)
@click.option(
    '--first_sense', '-fs', is_flag=True, help='Whether the "First WordNet Sense" baseline should be used.',
)
@click.option(
    '--info', '-i', is_flag=True, help='Whether to print some info on the console.',
)
@click.pass_context
def wsd_2(ctx, context, output, senseval, mappings, embeddings, first_sense, info):
    """
    Runs Method 2 of the Word Sense Disambiguation experiment, as described in Section 3.3.2.
    """
    # Deferred import so that unrelated subcommands start quickly.
    from scripts.wsd import wsd_method2 as wsd, wsd2_exp as exp

    senseval_path = 'data/senseval{}/'.format(senseval)
    output_file = '{}wsd_answers/{}_{}'.format(senseval_path, output, context)
    # Refuse to clobber an existing answer file.
    if os.path.exists(output_file):
        print('File {} already exists. Please delete old file or give another filename (-o).'.format(output_file))
        sys.exit(1)

    # Load every pickled, preprocessed SensEval document.
    input_path = senseval_path + 'processed/'
    docs = []
    for file in sorted(os.listdir(input_path)):
        if not file.endswith('.pkl'):
            continue
        print('Processing {}...'.format(file))
        docs.append(wsd.read_pkl(os.path.join(input_path, file)))

    # Get all required mappings. os.path.join makes these robust to a missing
    # trailing slash in -m; the old mixed concatenation ('/id_mapping.pkl' vs
    # 'reverse_id.pkl') broke whenever the directory did not end with '/'.
    id_map = wsd.read_pkl(os.path.join(mappings, 'id_mapping.pkl'))
    reverse_id_map = wsd.read_pkl(os.path.join(mappings, 'reverse_id.pkl'))
    gloss_map = wsd.read_pkl(os.path.join(mappings, 'gloss_mapping.pkl'))
    lemma_map = wsd.read_pkl(os.path.join(mappings, 'lemma_mapping.pkl'))
    pos_map = wsd.read_pkl(os.path.join(mappings, 'pos_mapping.pkl'))
    wordnet_map = wsd.read_pkl(os.path.join(mappings, 'pickled_wn_mapping.pkl'))

    # Get required embeddings; same robustness fix for -e.
    gloss_emb = wsd.read_pkl(os.path.join(embeddings, 'embedding_matrix_gloss'))  # dummy gloss embeddings
    lemma_emb = wsd.read_pkl(os.path.join(embeddings, 'embedding_matrix_lemma'))
    pos_emb = wsd.read_pkl(os.path.join(embeddings, 'embedding_matrix_pos'))
    node_emb = wsd.read_pkl(os.path.join(embeddings, 'wsd_node_embeddings.pkl'))

    disambiguator = wsd.WSD(id_map, reverse_id_map, gloss_map, gloss_emb, lemma_map,
                            lemma_emb, pos_map, pos_emb, node_emb, wordnet_map)
    for doc in docs:
        exp.run_experiment(window_size=context, document=doc, disambiguator=disambiguator,
                           output_file=output_file, mfs=first_sense, info=info)
    print('WSD answers saved under', output_file)
@main.command()
@click.option(
    '--answers', '-a', type=str, help='Name of the answer file. No path please.', default='wsd1', show_default=True,
)
@click.option(
    '--senseval', '-s', type=click.Choice(['2', '3']), help='Whether to use the SensEval 2 or 3 answer key.', default='3', show_default=True,
)
@click.pass_context
def score_wsd(ctx, answers, senseval):
    """
    Calls the official SensEval Scorer on some system output.
    """
    import subprocess

    senseval_path = 'data/senseval{}/'.format(senseval)
    answers = senseval_path + 'wsd_answers/' + answers
    # click.Choice yields strings, so compare against '2'. The old check
    # `senseval == 2` was always False, so the SensEval 2 key was never used.
    if senseval == '2':
        key = senseval_path + 'raw/key_senseval2'
    else:
        key = senseval_path + 'raw/EnglishAW.test.key'
    # List form (shell=False) keeps the paths from being shell-interpreted.
    command = ['scripts/scoring/scorer2', answers, key]
    subprocess.check_call(command)
# Script entry point: dispatch to the click command group.
if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment