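"""Entry point for training and evaluating the word classification models.

run() loads the train/test datasets, builds a tokenizer and model, preprocesses
the data with the selected tokenizer, runs the selected training loop and
optionally writes the argument configuration and evaluation results to a file.
"""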
import argparse
import json

import preprocess
import train
import evaluation
import models
from transformers import BertTokenizer, RobertaTokenizer, BertModel, RobertaModel, RobertaPreTrainedModel, RobertaConfig, BertConfig, BertPreTrainedModel, PreTrainedModel, AutoConfig, AutoModel, AutoTokenizer
from typing import List
def run(raw_args):
    """Load datasets, tokenizer and model, then train, evaluate and save results."""
    args = parser.parse_args(raw_args)

    # Datasets
    with open(args.train_dataset) as f:
        data_train = json.loads(f.read())
    with open(args.test_dataset) as f:
        data_test = json.loads(f.read())

    # Tokenizer & model
    tokenizer = AutoTokenizer.from_pretrained(args.architecture)
    models.set_seed(args.random_seed)
    if args.model_type == "separate":
        # architecture-specific classification classes
        if args.architecture == "bert-base-uncased":
            model = models.BertForWordClassification.from_pretrained(args.architecture).to("cuda")
        elif args.architecture == "roberta-base":
            model = models.RobertaForWordClassification.from_pretrained(args.architecture).to("cuda")
        else:
            print("non eligible architecture selected")
    elif args.model_type == "one":
        model = models.WordClassificationModel(args.architecture, args.tmix, args.imdb).to("cuda")
    else:
        print("non eligible model type selected")
    # Preprocessing
    if args.imdb:
        train_dataset = preprocess.tokenizer_imdb(tokenizer, data_train, args.max_length)
        test_dataset = preprocess.tokenizer_imdb(tokenizer, data_test, args.max_length)
    elif args.tokenizer == "salami":
        # no context implemented for this tokenizer
        train_dataset = preprocess.salami_tokenizer(tokenizer, data_train, args.max_length, masked=args.masking)
        test_dataset = preprocess.salami_tokenizer(tokenizer, data_test, args.max_length, masked=args.masking)
    elif args.tokenizer == "swp":
        print("train dataset preprocessing")
        print(args.tcontext)
        train_dataset = preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking, old_dataset=args.tcontext)
        test_dataset = preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking, old_dataset=False)
    elif args.tokenizer == "li":
        # no context implemented for this tokenizer
        train_dataset = preprocess.tokenizer_new(tokenizer, data_train, args.max_length, masked=args.masking)
        test_dataset = preprocess.tokenizer_new(tokenizer, data_test, args.max_length, masked=args.masking)
    # Training: pick the loop selected via --train_loop
    evaluation_train = None
    if args.train_loop == "swp":
        evaluation_test, evaluation_train = train.train(model, args.architecture, args.imdb, args.random_seed, args.mix_up, args.lambda_value, args.mixepoch, args.tmix, args.mixlayer, train_dataset, test_dataset, args.epochs, args.learning_rate, args.batch_size, args.test_batch_size)
    elif args.train_loop == "salami":
        evaluation_test = train.train_salami(model, args.random_seed, train_dataset, test_dataset, args.batch_size, args.test_batch_size, args.learning_rate, args.epochs)
    else:
        print("non eligible train loop selected")
        return

    if isinstance(args.save_directory, str):
        with open(args.save_directory, "x") as f:
            f.write(str(args))
            f.write(str(evaluation_test))
            f.write(str(evaluation_train))
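
# Command-line interface: the flags below correspond to the attributes read in run().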
parser = argparse.ArgumentParser()

# Architecture
parser.add_argument(
    "--architecture",
    help="Model to train",
    choices=["bert-base-uncased", "roberta-base"])
parser.add_argument(
    "--model_type",
    help="How to initialize the Classification Model",
    choices=["separate", "one"])
help="whether or not to use tmix. if yes, please specify layer and lambda",
action="store_true"
)
parser.add_argument(
"--mixlayer",
help="specify the layer to mix. Only select one layer at a time",
parser.add_argument(
    "--imdb",
    help="whether or not to use the imdb dataset",
    action="store_true")
# Datasets
parser.add_argument(
    "-t",
    "--train_dataset",
    help="Dataset to train on",
    required=True)
parser.add_argument(
    "-v",
    "--test_dataset",
    help="Dataset to test on",
    required=True)
# Preprocess arguments
parser.add_argument(
    "--tokenizer",
    choices=["salami", "li", "swp"],
    help="Which tokenizer to use when preprocessing the datasets")
parser.add_argument(
    "--tcontext",
    action="store_true",
    help="whether or not to preprocess the train set with context")
parser.add_argument(
    "--vcontext",  # flag name assumed; only the option body survives in the original
    action="store_true",
    help="whether or not to preprocess the test set with context")
parser.add_argument(
    "--masking",
    action="store_true",
    help="whether or not to mask the target word")
parser.add_argument(
    "--max_length",
    type=int,
    help="How big is max length when tokenizing the sentences?")
# Train arguments
parser.add_argument(
    "--train_loop",
    choices=["salami", "swp"],
    help="Which train loop to use")
parser.add_argument(
    "-b",
    "--batch_size",
    help="The batch size for the training process",
    type=int,
    default=32)
parser.add_argument(
    "-mixup",
    "--mix_up",
    help="whether or not to apply mixup during training",
    action="store_true")
parser.add_argument(
    "-lambda",
    "--lambda_value",
    help="specifies the lambda value for mixup",
    type=float,
    default=0.4)
help="specify the epoch(s) in which to apply mixup",
type=int,
default=1)
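# run() also reads args.epochs, args.learning_rate and args.random_seed, but their
# add_argument calls are missing from this listing; the definitions below are an
# assumed reconstruction (flag names taken from run(), types assumed, no defaults).
parser.add_argument(
    "--epochs",
    help="number of training epochs",
    type=int)
parser.add_argument(
    "--learning_rate",
    help="learning rate for training",
    type=float)
parser.add_argument(
    "--random_seed",
    help="random seed for reproducibility",
    type=int)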
# Test arguments
parser.add_argument(
    "-tb",
    "--test_batch_size",
    help="The batch size for testing",  # help text assumed
    type=int)
# Save and Organisation
parser.add_argument(
    "-sd",
    "--save_directory",
    help="Directory to save run")