import argparse
import json

import Code.preprocess
import Code.train
import Code.evaluation
import Code.models
from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaModel, RobertaPreTrainedModel, RobertaConfig, BertConfig, BertPreTrainedModel, PreTrainedModel, AutoConfig, AutoModel, AutoTokenizer
from typing import List
def run(raw_args):
    args = parser.parse_args(raw_args)
    #load test and train dataset as well as tokenizers and models...
    #Datasets
    with open(args.train_dataset) as f:
        data_train = json.loads(f.read())
    with open(args.test_dataset) as f:
        data_test = json.loads(f.read())
    #Tokenizers & Models
    tokenizer = AutoTokenizer.from_pretrained(args.architecture)
    if args.model_type == "separate":
        # assumed: pick the architecture-specific class based on --architecture
        if args.architecture == "bert-base-uncased":
            model = Code.models.BertForWordClassification.from_pretrained(args.architecture).to("cuda")
        elif args.architecture == "roberta-base":
            model = Code.models.RobertaForWordClassification.from_pretrained(args.architecture).to("cuda")
        else:
            print("non eligible model type selected")
    elif args.model_type == "one":
        # single wrapper model that builds the backbone from args.architecture
        model = Code.models.WordClassificationModel(args.architecture, args.tmix, args.imdb).to("cuda")
    else:
        print("non eligible model type selected")
    #preprocess...
    # Assumed branching: --imdb selects the imdb preprocessing, otherwise the
    # --tokenizer choice decides; the mapping of "li"/"swp" to the two
    # tokenizer_new variants below is an assumption.
    if args.imdb:
        train_dataset = Code.preprocess.tokenizer_imdb(tokenizer, data_train, args.max_length)
        test_dataset = Code.preprocess.tokenizer_imdb(tokenizer, data_test, args.max_length)
    elif args.tokenizer == "salami":
        train_dataset = Code.preprocess.salami_tokenizer(tokenizer, data_train, args.max_length, masked=args.masking) #no context implemented
        test_dataset = Code.preprocess.salami_tokenizer(tokenizer, data_test, args.max_length, masked=args.masking)
    elif args.tokenizer == "li":
        print("train dataset preprocessing")
        print(args.tcontext)
        train_dataset = Code.preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking, old_dataset=args.tcontext)
        test_dataset = Code.preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking, old_dataset=False)
    else:  # "swp"
        train_dataset = Code.preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking) #no context implemented
        test_dataset = Code.preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking)
    # Train: which loop runs is assumed to follow --train_loop.
    if args.train_loop == "swp":
        evaluation_test, evaluation_train = Code.train.train(model, args.architecture, args.imdb, args.random_seed, args.mix_up, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, train_dataset, test_dataset, args.epochs, args.learning_rate, args.batch_size, args.test_batch_size, args.model_save_path)
    else:  # "salami" loop only returns the test evaluation
        evaluation_test = Code.train.train_salami(model, args.random_seed, train_dataset, test_dataset, args.batch_size, args.test_batch_size, args.learning_rate, args.epochs)
        evaluation_train = None
    # Save run arguments and evaluation results (mode "x": the file must not already exist).
    if isinstance(args.save_directory, str):
        with open(args.save_directory, "x") as f:
            f.write(str(args))
            f.write(str(evaluation_test))
            f.write(str(evaluation_train))

parser = argparse.ArgumentParser()
#Architecture
parser.add_argument(
    "--architecture",
    help="Model to train",
    choices=["bert-base-uncased", "roberta-base"])
parser.add_argument(
    "--model_type",
    help="How to initialize the Classification Model",
    choices=["separate", "one"])
parser.add_argument(
    "--tmix",  # flag name inferred from args.tmix used in run()
    help="whether or not to use tmix. if yes, please specify layer and lambda",
    action="store_true")
parser.add_argument(
    "--mixlayer",
    help="specify the layer to mix. Only select one layer at a time",
    type=int)  # type assumed
parser.add_argument(
    "--imdb",
    help="whether or not to use the imdb dataset",
    action="store_true")
#Datasets
parser.add_argument(
    "-t",
    "--train_dataset",
    help="Dataset to train on",
    required=True)
parser.add_argument(
    "-v",
    "--test_dataset",
    help="Dataset to test on",
    required=True)
#Preprocess arguments
parser.add_argument(
    "--tokenizer",
    choices=["salami", "li", "swp"],
    help="Which tokenizer to use when preprocessing the datasets")
parser.add_argument(
    "--tcontext",
    action="store_true",
    help="whether or not to preprocess train set with context")
parser.add_argument(
    "--vcontext",  # assumed flag name (test-set counterpart of --tcontext)
    action="store_true",
    help="whether or not to preprocess the test set with context")
parser.add_argument(
    "--masking",
    action="store_true",
    help="whether or not to mask the target word")
parser.add_argument(
    "--max_length",
    type=int,  # type assumed
    help="How big is max length when tokenizing the sentences?")
#Train arguments
parser.add_argument(
    "--train_loop",
    choices=["salami", "swp"],
    help="Which Train loop to use")
parser.add_argument(
    "-b",
    "--batch_size",
    help="The batch size for the training process",
    type=int,
    default=32)
parser.add_argument(
    "-mixup",
    "--mix_up",
    help="whether or not to apply mixup during training",
    action="store_true")
parser.add_argument(
    "-lambda",
    "--lambda_value",
    help="specifies the lambda value for mixup",
    type=float,
    default=0.4)
parser.add_argument(
    "--mixepoch",  # flag name inferred from args.mixepoch used in run()
    help="specify the epoch(s) in which to apply mixup",
    type=int)
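# run() above also reads args.random_seed, args.epochs and args.learning_rate,
# which are not defined anywhere in this excerpt. The definitions below are a
# minimal, assumed sketch so the script stays runnable; the flag names follow
# the attribute names, and the defaults are placeholders, not the project's
# actual values.
parser.add_argument(
    "--random_seed",
    help="random seed for reproducible runs (assumed definition)",
    type=int,
    default=42)
parser.add_argument(
    "--epochs",
    help="number of training epochs (assumed definition)",
    type=int,
    default=3)
parser.add_argument(
    "--learning_rate",
    help="learning rate for the optimizer (assumed definition)",
    type=float,
    default=2e-5)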
#Test arguments
# help/type/default below are assumed, mirroring --batch_size.
parser.add_argument(
    "-tb",
    "--test_batch_size",
    help="The batch size for the test loop",
    type=int,
    default=32)
#Save and Organisation
parser.add_argument(
    "-sd",
    "--save_directory",
    help="Directory to save run")
parser.add_argument(
    "-msp",
    "--model_save_path",
    help="path to save model")