import argparse
import json

import preprocess
import train
import evaluation
import models
from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaModel, RobertaPreTrainedModel, RobertaConfig, BertConfig, BertPreTrainedModel, PreTrainedModel, AutoConfig, AutoModel, AutoTokenizer
#print("parsing")
#args=_parse_args(raw_args)
#print("parsed arguments")
#load test and train dataset as well as tokenizers and models...
#Datasets
with open(args.train_dataset) as f:
data_train=json.loads(f.read())
with open(args.test_dataset) as f:
data_test=json.loads(f.read())
    # Tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(args.architecture)
    models.set_seed(args.random_seed)
    if args.model_type == "separate":
        # Use the architecture-specific classification head.
        if args.architecture == "bert-base-uncased":
            model = models.BertForWordClassification.from_pretrained(args.architecture).to("cuda")
        else:
            model = models.RobertaForWordClassification.from_pretrained(args.architecture).to("cuda")
    elif args.model_type == "one":
        model = models.WordClassificationModel(args.architecture, args.tmix, args.imdb).to("cuda")
    else:
        print("non-eligible model type selected")
    # Preprocess
    if args.imdb:
        train_dataset = preprocess.tokenizer_imdb(tokenizer, data_train, args.max_length)
        test_dataset = preprocess.tokenizer_imdb(tokenizer, data_test, args.max_length)
    elif args.tokenizer == "salami":
        # No context handling implemented for this tokenizer.
        train_dataset = preprocess.salami_tokenizer(tokenizer, data_train, args.max_length, masked=args.masking)
        test_dataset = preprocess.salami_tokenizer(tokenizer, data_test, args.max_length, masked=args.masking)
    elif args.tokenizer == "swp":
        train_dataset = preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking, old_dataset=args.tcontext)
        test_dataset = preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking, old_dataset=False)
    elif args.tokenizer == "li":
        # No context handling implemented for this tokenizer.
        train_dataset = preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking)
        test_dataset = preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking)
    else:
        print("non-eligible tokenizer selected")
    # Train: select the train loop requested on the command line.
    # (Evaluation is done inside the train loops; averaging over multiple
    # random seeds could be implemented here.)
    if args.train_loop == "swp":
        evaluation_test, evaluation_train = train.train(
            model, args.architecture, args.imdb, args.random_seed,
            args.gradient_accumulation_steps, args.mix_up, args.threshold,
            args.lambda_value, args.mixepoch, args.tmix, args.mixlayer,
            train_dataset, test_dataset, args.epochs, args.learning_rate,
            args.batch_size, args.test_batch_size)
    else:
        evaluation_test = train.train_salami(
            model, args.random_seed, train_dataset, test_dataset,
            args.batch_size, args.test_batch_size, args.learning_rate, args.epochs)
        evaluation_train = None
    # Save the run configuration and results ("x" mode fails if the file exists).
    if isinstance(args.save_directory, str):
        with open(args.save_directory, "x") as f:
            f.write(str(args))
            f.write(str(evaluation_test))
            f.write(str(evaluation_train))
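

# The comment above notes that averaging over multiple random seeds could be
# added after training. A minimal sketch of such a helper is given below; it
# is illustrative only and assumes each run returns a flat dict of numeric
# metrics (the actual return type of the train loops may differ).
def average_over_seeds(metric_dicts):
    """Average numeric metrics collected from several runs (e.g. different seeds)."""
    keys = metric_dicts[0].keys()
    return {k: sum(d[k] for d in metric_dicts) / len(metric_dicts) for k in keys}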


def _parse_args(raw_args=None):
    parser = argparse.ArgumentParser()
    # Architecture
    parser.add_argument(
        "--architecture",
        help="Model to train",
        choices=["bert-base-uncased", "roberta-base"])
    parser.add_argument(
        "--model_type",
        help="How to initialize the Classification Model",
        choices=["separate", "one"])
help="whether or not to use tmix. if yes, please specify layer and lambda",
action="store_true"
)
parser.add_argument(
"--mixlayer",
help="specify the layer to mix. Only select one layer at a time",
parser.add_argument(
"--imdb",
help="whether or not to use the imdb dataset",
action="store_true"
)
    # Datasets
    parser.add_argument(
        "-t",
        "--train_dataset",
        help="Dataset to train on",
        required=True)
    parser.add_argument(
        "-v",
        "--test_dataset",
        help="Dataset to test on",
        required=True)
    # Preprocess arguments
    parser.add_argument(
        "--tokenizer",
        choices=["salami", "li", "swp"],
        help="Which tokenizer to use when preprocessing the datasets")
action="store_true",
#default=False,
#type=bool,
help="whether or not to preprocess train set with context")
#default=False,
#type=bool,
action="store_true",
help="whether or not to preprocess the test set with context")
#default=False,
#type=bool,
action="store_true",
help="whether or not to mask the target word")
help="How big is max length when tokenizing the sentences?")
    # Train arguments
    parser.add_argument(
        "--train_loop",
        choices=["salami", "swp"],
        help="Which train loop to use")
    parser.add_argument(
        "-b",
        "--batch_size",
        help="The batch size for the training process",
        type=int,
        default=32)
    parser.add_argument(
        "-gras",
        "--gradient_accumulation_steps",
        help="gradient accumulation steps for training",
        type=int,
        default=1)
    parser.add_argument(
        "-mixup",
        "--mix_up",
        help="whether or not to apply mixup during training",
        action="store_true")
    parser.add_argument(
        "-threshold",
        "--threshold",
        help="specifies the value for the mixup threshold",
        type=float,
        default=0.05)
    parser.add_argument(
        "-lambda",
        "--lambda_value",
        help="specifies the lambda value for mixup",
        type=float,
        default=0.4)
    parser.add_argument(
        "--mixepoch",
        help="specify the epoch(s) in which to apply mixup",
        type=int,
        default=1)
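    # The train loops also read a random seed, epoch count, and learning rate.
    # These options are not among the argument definitions shown in this
    # excerpt, so the names below are taken from the attribute accesses in
    # run() and the defaults are only placeholders.
    parser.add_argument(
        "--random_seed",
        type=int,
        default=42)
    parser.add_argument(
        "--epochs",
        type=int,
        default=1)
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=2e-5)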
    # Test arguments
    parser.add_argument(
        "-tb",
        "--test_batch_size",
        help="The batch size for the evaluation process",
        type=int,
        default=32)
    # Save and Organisation
    parser.add_argument(
        "-sd",
        "--save_directory",
        help="Directory to save run")
    return parser.parse_args(raw_args)