diff --git a/.gitignore b/.gitignore
index c4a428c1be9e42aa2a52b207ce8464736c05c8ff..e67bbc943106d2a8a2bebdc8cfd731d3317f5281 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
+__pycache__
 venv
 bert-base-uncased-hatexplain-rationale-two
diff --git a/test.py b/test.py
index 41742ed0238cced81424fa540e18eec4b61e121b..6ddfc178ce1b93b4faa0e7ac0ac3a27cfd19b950 100644
--- a/test.py
+++ b/test.py
@@ -6,13 +6,14 @@ from nltk.tokenize.treebank import TreebankWordDetokenizer
 from utils.eval import eval
 from utils.attack import attack
 
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
-tokenizer = AutoTokenizer.from_pretrained("Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two")
-model = \
-    Model_Rational_Label.from_pretrained(
-        "Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two"
-    )
+tokenizer = AutoTokenizer.from_pretrained(
+    "Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two"
+)
+model = Model_Rational_Label.from_pretrained(
+    "Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two"
+)
 
 model = model.to(device)
 
@@ -26,16 +27,18 @@ model = model.to(device)
 # print(f"Normal: {probs[1][0]}\nHatespeech: {probs[1][1]}")
 
 # Load test dataset
-with open('data/post_id_divisions.json') as splits:
+with open("data/post_id_divisions.json") as splits:
     data = json.load(splits)
-    test_ids = data['test']
+    test_ids = data["test"]
+
 
 def dataset(ids):
-    with open('data/dataset.json') as data_file:
+    with open("data/dataset.json") as data_file:
         data = json.load(data_file)
     for i in ids:
         yield data[i]
 
+
 counter = 0
 batchsize = 8
 for post in dataset(test_ids):
@@ -43,15 +46,17 @@ for post in dataset(test_ids):
     #     break
     # counter += 1
 
-    detokenized = TreebankWordDetokenizer().detokenize(post["post_tokens"])
-    # batch = attack(detokenized)
+    text = TreebankWordDetokenizer().detokenize(post["post_tokens"])
 
-    # probabilities = eval(detokenized, model, tokenizer)
-    probabilities = eval(["this is a test", "this is a tast"], model, tokenizer)
+    attacks = attack(text, model, tokenizer)
+    print(attacks)
+
+    probabilities = eval(attacks, model, tokenizer)
+    # probabilities = eval(["this is a test", "this is a tast"], model, tokenizer)
     print(probabilities)
 
     # print(f"Normal: {probabilities[0][0]}\nHatespeech: {probabilities[0][1]}\n\n")
     # print(f"Normal: {probabilities[1][0]}\nHatespeech: {probabilities[1][1]}\n\n")
-    
+
     # ATTACK HERE
     # batch = attack(detokenized)
@@ -68,4 +73,3 @@ for post in dataset(test_ids):
     # print(post["post_id"])
     # print(post["annotators"][0]["label"])
     # print(TreebankWordDetokenizer().detokenize(post["post_tokens"]))
-
diff --git a/utils/__pycache__/attack.cpython-38.pyc b/utils/__pycache__/attack.cpython-38.pyc
deleted file mode 100644
index 775aa2dc19cfcc635d2bb60490f0cc6bc040346b..0000000000000000000000000000000000000000
Binary files a/utils/__pycache__/attack.cpython-38.pyc and /dev/null differ
diff --git a/utils/__pycache__/eval.cpython-38.pyc b/utils/__pycache__/eval.cpython-38.pyc
deleted file mode 100644
index af701cdd725a97b7278e7893c946fb1cf3c62780..0000000000000000000000000000000000000000
Binary files a/utils/__pycache__/eval.cpython-38.pyc and /dev/null differ
diff --git a/utils/attack.py b/utils/attack.py
index d6d1c127209cc7e5d17fb3c1e11664d7444dabb5..506214ecb3d035621a00751d477ae38d00ad7ead 100644
--- a/utils/attack.py
+++ b/utils/attack.py
@@ -1,5 +1,103 @@
+import string
+
+import torch
 import transformers
 
+from utils.eval import eval
+
 
-def attack(sentence, model, tokenizer):
+def attack(text, model, tokenizer, subs=1, top_k=5):
+    """
+    Return adversarial examples.
+
+    Parameters
+    ----------
+    text : str
+        Text to be attacked/modified.
+    model : transformers.AutoModelForSequenceClassification
+        Victim model, trained HateXplain model.
+    tokenizer : transformers.AutoTokenizer
+        Tokenizer from the trained HateXplain model.
+    subs : int
+        Number of character substitutions.
+        Default: 1
+    top_k : int
+        Return this many of the best candidates. Best is determined by how
+        much they lower the hate-speech probability.
+        Default: 5
+
+    Returns
+    -------
+    attacks : List[str]
+        The `top_k` attacks on the input text.
+    """
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     model = model.to(device)
+
+    # Compute the hate-speech probability prior to the attacks
+    # inputs = tokenizer(
+    #     text,
+    #     return_tensors="pt",
+    #     padding=True
+    # ).to(device)
+    # prediction_logits, _ = model(
+    #     input_ids=inputs['input_ids'],
+    #     attention_mask=inputs['attention_mask']
+    # )
+    # softmax = torch.nn.Softmax(dim=1)
+    # prior_probabilities = softmax(prediction_logits)
+    # prior_hatespeech_probability = prior_probabilities[0][1]
+
+    prior_hatespeech_probability = eval(text, model, tokenizer)[0][1]
+
+    # Generate attacks. NOTE: only single-character substitutions are
+    # implemented so far; `subs` is currently unused.
+    candidate_scores = {}
+    for i, char in enumerate(text):
+        for candidate in generate_candidates(text, i, model, tokenizer):
+            candidate_probability = eval(candidate, model, tokenizer)[0][1]
+
+            # Higher is better: the score measures how far the substitution
+            # pushes the hate-speech probability down.
+            candidate_score = prior_hatespeech_probability - candidate_probability
+            candidate_scores[candidate] = candidate_score
+
+    sorted_candidate_scores = dict(sorted(candidate_scores.items(),
+                                          key=lambda item: item[1],
+                                          reverse=True))
+    attacks = list(sorted_candidate_scores)[:top_k]
+    return attacks
+
+
+def generate_candidates(text, i, model, tokenizer):
+    """
+    Substitute the character at one position of the text with every
+    possible substitution.
+
+    Parameters
+    ----------
+    text : str
+        Text to be attacked/modified.
+    i : int
+        Index of the character to be substituted.
+    model : transformers.AutoModelForSequenceClassification
+        Victim model, trained HateXplain model.
+    tokenizer : transformers.AutoTokenizer
+        Tokenizer from the trained HateXplain model.
+
+    Yields
+    ------
+    candidate : str
+        Copy of `text` with the character at index `i` replaced by one
+        printable character.
+    """
+    # string.printable is digits + ascii_letters + punctuation + whitespace
+    permissible_substitutions = string.printable
+
+    for substitution_char in permissible_substitutions:
+        if substitution_char == text[i]:
+            continue
+        candidate = list(text)
+        candidate[i] = substitution_char
+        candidate = "".join(candidate)
+        yield candidate
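For reference, a minimal sketch of how the new `attack` helper can be driven outside the test loop. The `Model_Rational_Label` import path is an assumption (test.py's actual import sits above the hunks in this diff), and the output format assumes `eval` returns one [normal, hatespeech] probability row per input, as the commented-out prints in test.py suggest:

    from transformers import AutoTokenizer

    from models import Model_Rational_Label  # hypothetical import path
    from utils.attack import attack
    from utils.eval import eval

    tokenizer = AutoTokenizer.from_pretrained(
        "Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two"
    )
    model = Model_Rational_Label.from_pretrained(
        "Hate-speech-CNERG/bert-base-uncased-hatexplain-rationale-two"
    )

    # Top-5 single-character substitutions that most reduce the
    # hate-speech probability, re-scored in one batch.
    attacks = attack("this is a test", model, tokenizer, top_k=5)
    probabilities = eval(attacks, model, tokenizer)
    for candidate, probs in zip(attacks, probabilities):
        print(f"{candidate!r}: normal={probs[0]}, hatespeech={probs[1]}")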