# main.py — CLI entry point: load datasets, build a word-classification model,
# preprocess, train/evaluate, and save the results.
# Standard library
import argparse
import copy
import json  # needed for json.loads() on the dataset files
from typing import List

# Third-party
from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaModel, RobertaPreTrainedModel, RobertaConfig,  BertConfig, BertPreTrainedModel, PreTrainedModel, AutoConfig, AutoModel, AutoTokenizer

# Local
import Code.preprocess
import Code.train
import Code.evaluation
import Code.models

def run(raw_args):
	"""Run one full experiment: load data, build the model, preprocess, train, save.

	Parameters
	----------
	raw_args : argparse.Namespace
		The parsed CLI arguments (see the parser in ``__main__``).

	Raises
	------
	ValueError
		If an unsupported architecture, model type, tokenizer, or train loop
		is selected (previously these cases only printed a message and then
		crashed later with a NameError).
	"""
	# BUG FIX: the body used a bare ``args`` that only resolved via the
	# module-global leaked from ``__main__``; bind the parameter explicitly
	# so ``run`` also works when imported from another module.
	args = raw_args

	# --- Datasets -----------------------------------------------------------
	print("opened datasets...")
	with open(args.train_dataset) as f:
		data_train = json.loads(f.read())
	with open(args.test_dataset) as f:
		data_test = json.loads(f.read())

	# --- Tokenizer & model --------------------------------------------------
	print("Loading tokenizers and initializing model...")
	tokenizer = AutoTokenizer.from_pretrained(args.architecture)

	Code.models.set_seed(args.random_seed)
	if args.model_type == "separate":
		# Architecture-specific heads with their own pretrained weights.
		if args.architecture == "bert-base-uncased":
			model = Code.models.BertForWordClassification.from_pretrained(args.architecture).to("cuda")
		elif args.architecture == "roberta-base":
			model = Code.models.RobertaForWordClassification.from_pretrained(args.architecture).to("cuda")
		else:
			raise ValueError("non eligible architecture selected: %r" % (args.architecture,))
	elif args.model_type == "one":
		# Single generic classification wrapper around the chosen backbone.
		model = Code.models.WordClassificationModel(args.architecture, args.tmix, args.imdb).to("cuda")
	else:
		raise ValueError("non eligible model type selected: %r" % (args.model_type,))

	# --- Preprocess ---------------------------------------------------------
	print("preprocessing datasets...")
	if args.imdb:
		# imdb mode overrides the tokenizer choice entirely.
		train_dataset = Code.preprocess.tokenizer_imdb(tokenizer, data_train, args.max_length)
		test_dataset = Code.preprocess.tokenizer_imdb(tokenizer, data_test, args.max_length)
	elif args.tokenizer == "salami":
		train_dataset = Code.preprocess.salami_tokenizer(tokenizer, data_train, args.max_length, masked=args.masking)  # no context implemented
		test_dataset = Code.preprocess.salami_tokenizer(tokenizer, data_test, args.max_length, masked=args.masking)
	elif args.tokenizer == "swp":
		print("train dataset preprocessing ")
		# Only the train split may use the old-dataset/context variant.
		train_dataset = Code.preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking, old_dataset=args.tcontext)
		test_dataset = Code.preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking, old_dataset=False)
	elif args.tokenizer == "li":
		train_dataset = Code.preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking)  # no context implemented
		test_dataset = Code.preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking)
	else:
		raise ValueError("non eligible tokenizer selected: %r" % (args.tokenizer,))

	# --- Train & evaluate ---------------------------------------------------
	print("training..")
	# BUG FIX: only the "swp" loop returns train metrics; initialize so the
	# save step below does not hit a NameError after the "salami" loop.
	evaluation_train = None
	if args.train_loop == "swp":
		evaluation_test, evaluation_train = Code.train.train(model, args.architecture, train_dataset, test_dataset, args.random_seed, args.batch_size, args.test_batch_size, args.epochs, args.imdb, args.mix_up, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, args.learning_rate, args.second_learning_rate, args.model_save_path)
	elif args.train_loop == "salami":
		evaluation_test = Code.train.train_salami(model, args.random_seed, train_dataset, test_dataset, args.batch_size, args.test_batch_size, args.learning_rate, args.epochs)
	else:
		raise ValueError("no eligible train loop selected: %r" % (args.train_loop,))

	# --- Save ---------------------------------------------------------------
	if isinstance(args.save_directory, str):
		# mode "x": refuse to clobber an existing results file.
		with open(args.save_directory, "x") as f:
			f.write(str(args))
			f.write(str(evaluation_test))
			f.write(str(evaluation_train))
	print("saved and done")

if __name__ == "__main__":
	parser = argparse.ArgumentParser()

	# Architecture
	parser.add_argument(
		"--architecture",
		help="Model to train",
		choices=["bert-base-uncased", "roberta-base"])

	parser.add_argument(
		"--model_type",
		help="How to initialize the Classification Model",
		choices=["separate", "one"])

	parser.add_argument(
		"--tmix",
		help="whether or not to use tmix. if yes, please specify layer and lambda",
		action="store_true")

	# BUG FIX: this add_argument call was missing its closing parenthesis,
	# which made the whole file a SyntaxError.
	parser.add_argument(
		"--mixlayer",
		help="specify the layer to mix. Only select one layer at a time",
		type=int)

	parser.add_argument(
		"--imdb",
		help="whether or not to use the imdb dataset",
		action="store_true")

	parser.add_argument(
		"--mlp",
		help="use two layer multi layer perceptron at the end? (if no, linear classifier)",
		action="store_true")

	# Datasets
	parser.add_argument(
		"-t",
		"--train_dataset",
		help="Dataset to train on",
		required=True)
	parser.add_argument(
		"-v",
		"--test_dataset",
		help="Dataset to test on",
		required=True)

	# Preprocess arguments
	parser.add_argument(
		"--tokenizer",
		choices=["salami", "li", "swp"],
		help="Which tokenizer to use when preprocessing the datasets")

	parser.add_argument(
		"-tc",
		"--tcontext",
		action="store_true",
		help="whether or not to preprocess train set with context")

	parser.add_argument(
		"-vc",
		"--vcontext",
		action="store_true",
		help="whether or not to preprocess the test set with context")

	parser.add_argument(
		"--masking",
		action="store_true",
		help="whether or not to mask the target word")

	parser.add_argument(
		"-max",
		"--max_length",
		type=int,  # BUG FIX: previously parsed as str, breaking numeric use downstream
		help="Max sequence length when tokenizing the sentences?")

	# Train arguments
	parser.add_argument(
		"--train_loop",
		choices=["salami", "swp"],
		help="Which Train loop to use")

	parser.add_argument(
		"-e",
		"--epochs",
		type=int,  # BUG FIX: epochs must be an int, not a str
		help="Number of epochs for training")

	parser.add_argument(
		"-lr",
		"--learning_rate",
		type=float,  # BUG FIX: learning rate must be a float, not a str
		help="Learning rate for training")

	parser.add_argument(
		"-lrtwo",
		"--second_learning_rate",
		type=float,
		help="Separate learning rate for multi layer perceptron",
		default=None)

	parser.add_argument(
		"-rs",
		"--random_seed",
		type=int,
		default=42,
		help="Random seed for initialization of model")

	parser.add_argument(
		"-b",
		"--batch_size",
		help="The batch size for the training process",
		type=int,
		default=32)

	parser.add_argument(
		"-mixup",
		"--mix_up",
		help="whether or not to apply mixup during training",
		action="store_true")

	# BUG FIX: this add_argument call was unterminated (missing ``)``) and
	# had no type; lambda is a mixing coefficient, so parse it as float.
	parser.add_argument(
		"-lambda",
		"--lambda_value",
		help="speficies the lambda value for mixup",
		type=float)

	parser.add_argument(
		"-mixepoch",
		"--mixepoch",
		help="specify the epoch(s) in which to apply mixup",
		type=int,
		default=None)

	# Test arguments
	parser.add_argument(
		"-tb",
		"--test_batch_size",
		help="The batch size for the test process",
		type=int,
		default=16)

	# Save and Organisation
	parser.add_argument(
		"-sd",
		"--save_directory",
		help="Destination directory for the output results of the run")

	parser.add_argument(
		"-msp",
		"--model_save_path",
		help="path to save model")

	args = parser.parse_args()
	run(args)