Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
A
Automated Diagnosing of Patients Based on Physiological Data Using Language Models
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
kulcsar
Automated Diagnosing of Patients Based on Physiological Data Using Language Models
Commits
f95c1cf6
Commit
f95c1cf6
authored
1 year ago
by
kulcsar
Browse files
Options
Downloads
Patches
Plain Diff
clean up some code
parent
550a5c4a
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
evaluate_model_2.py
+6
-73
6 additions, 73 deletions
evaluate_model_2.py
with
6 additions
and
73 deletions
evaluate_model_2.py
+
6
−
73
View file @
f95c1cf6
...
...
@@ -9,12 +9,10 @@ from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Tenso
import
logging
from
transformers
import
get_scheduler
,
AdamW
from
t5_model
import
preprocess
#from accelerate import Accelerator
import
argparse
logging
.
basicConfig
(
filename
=
'
log_t5.log
'
,
level
=
logging
.
DEBUG
)
device
=
torch
.
device
(
'
cuda
'
if
torch
.
cuda
.
is_available
()
else
'
cpu
'
)
#accelerator=Accelerator()
#device=accelerator.device
def
run
():
print
(
args
.
saved_model
)
print
(
args
.
tokenizer
)
...
...
@@ -22,22 +20,14 @@ def run():
logging
.
info
(
"
performing evaluation
"
)
logging
.
info
(
"
loading saved model
"
)
#checkpoint=torch.load(args.saved_model)
#model.load_state_dict(checkpoint)
model
=
BioGptForCausalLM
.
from_pretrained
(
args
.
saved_model
)
#model=AutoModelForCausalLM.from_pretrained(args.saved_model)
#model=T5ForConditionalGeneration.from_pretrained("t5-small")
#model.load_state_dict(torch.load(args.saved_model))
#model=accelerator.load("./t5_small_deepspeed_train_test_2.pt")
#model=accelerator.load_state("./t5_small_deepspee_train_test.pt")
#model=BioGptForCausalLM.from_pretrained(args.saved_model)
model
=
AutoModelForCausalLM
.
from_pretrained
(
args
.
saved_model
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
args
.
tokenizer
)
tokenizer
.
padding_side
=
"
left
"
tokenizer
.
pad_token
=
tokenizer
.
eos_token
model
.
config
.
pad_token_id
=
model
.
config
.
eos_token_id
test_dataset
=
preprocess
(
tokenizer
,
args
.
test_dataset
)
#with open(args.test_dataset, "rb") as f:
# data=pkl.load(f)
#test_dataset=DiagnosesDataset(data, tokenizer)
logging
.
info
(
"
running evaluation
"
)
res
=
evaluate_model_loop
(
model
,
args
.
config_name
,
test_dataset
,
args
.
batch_size
,
tokenizer
,
args
.
topk
,
args
.
temp
,
args
.
num_beams
,
args
.
early_stopping
,
args
.
no_rep_ngram
,
args
.
num_return_sequences
,
args
.
metrics
,
args
.
do_sample
,
args
.
generative
,
args
.
icd_codes
)
logging
.
info
(
res
)
...
...
@@ -62,25 +52,9 @@ class DiagnosesDataset(torch.utils.data.Dataset):
label_instruction
=
self
.
tokenizer
(
labels
)
i
=
len
(
tokenized_instruction
[
"
input_ids
"
])
-
1
#while 1<len(item["input_ids"])
#print("Len of item labels before ", len(item["labels"]))
item
[
"
labels
"
][
i
:]
=
label_instruction
[
"
input_ids
"
]
#item.pop("token_type_ids")
#print(item["labels"])
#we now need to pad to 2048
item
[
"
labels
"
][
i
:]
=
label_instruction
[
"
input_ids
"
]
#print("Len labels: ", len(item["labels"]))
#print("Len input ids: ", len(item["input_ids"]))
#print("\n\n")
#try:
# assert len(item["labels"]) == len(item["input_ids"])
#except AssertionError:
# print(len(item["labels"]))
# print(len(item["input_ids"]))
# print(len(tokenized_instruction["input_ids"]))
# print("\n\n")
# break
return
item
def
tokenize
(
self
,
prompt
):
...
...
@@ -90,24 +64,7 @@ class DiagnosesDataset(torch.utils.data.Dataset):
max_length
=
1024
,
padding
=
False
,
return_tensors
=
None
)
#print(type(result_prompt))
#print(len(result_prompt["input_ids"]))
#result_labels=self.tokenizer(labels,
# truncation=True,
# max_length=1024,
# padding=False,
# return_tensors=None)
#old_labels=result_labels["input_ids"].copy()
#result_prompt["labels"]=[-100 for i in result_prompt["input_ids"]] + result_labels["input_ids"]
#result_prompt["input_ids"]=result_prompt["input_ids"] + old_labels
#print(result_prompt["input_ids"]
#result_prompt["labels"] = [-100 for i in result_prompt["input_ids"]] + result_labels["input_ids"]
#print(len(result_prompt["labels"]))
#assert len(result_prompt["input_ids"]) == len(result_prompt["labels"])
result_prompt
[
"
labels
"
]
=
[
-
100
]
*
len
(
result_prompt
[
"
input_ids
"
])
#print(result_prompt["labels"])
return
result_prompt
def
__len__
(
self
):
...
...
@@ -137,10 +94,6 @@ def evaluate_model_loop(model,config_name, test_dataset, batch_size, tokenizer,
print
(
"
num_return_sequences:
"
,
num_return_sequences
)
print
(
"
metrics:
"
,
metrics
)
print
(
"
generative?
"
,
generative
)
#config=GenerationConfig.from_pretrained(config_name, top_k=top_k, temperature=temp, num_beams=num_beams, early_stopping=early_stopping, no_repeat_ngram_size=no_rep, num_return_sequences=num_return_sequences, max_length=512)
#print(config.num_return_sequences)
#print(tokenizer.max_length)
eval_sampler
=
SequentialSampler
(
test_dataset
)
eval_dataloader
=
DataLoader
(
test_dataset
,
sampler
=
eval_sampler
,
batch_size
=
batch_size
)
accuracies
=
[]
...
...
@@ -149,35 +102,15 @@ def evaluate_model_loop(model,config_name, test_dataset, batch_size, tokenizer,
precs
=
[]
for
index
,
batch
in
tqdm
(
enumerate
(
eval_dataloader
)):
with
torch
.
no_grad
():
#print(batch.to(device))
#if index == 20:
# break
print
(
len
(
batch
[
"
input_ids
"
]))
#input_ids=torch.tensor(batch[0]).unsqueeze(0).to(device)
input_ids
=
batch
[
"
input_ids
"
]
#print(input_ids.size())
attention_mask
=
batch
[
"
attention_mask
"
]
labels
=
batch
[
"
labels
"
]
labels_len
=
len
(
batch
[
2
][
batch
[
2
]
!=
tokenizer
.
pad_token_id
])
input_ids_len
=
len
(
batch
[
0
][
0
])
print
(
"
Len input ids:
"
,
input_ids_len
)
print
(
"
Len labels:
"
,
labels_len
)
#attention_mask=torch.tensor(batch[1]).unsqueeze(0).to(device)
#labels=torch.tensor(batch[2]).unsqueeze(0).to(device)
#last_inp_token_index=batch[2][::-1].index(-100)
#last_occurrence=len(batch[2]) - last_inp_token_index
#print(last_occurrence)
#label_length=len(batch["input_ids"][last_occurrence])
#print(label_length)
#outputs=model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512, top_k=top_k, temperature=temp, num_beams=num_beams, early_stopping=early_stopping, no_repeat_ngram_size=no_rep, num_return_sequences=num_return_sequences)
outputs
=
model
.
generate
(
input_ids
=
input_ids
,
attention_mask
=
attention_mask
,
top_k
=
top_k
,
temperature
=
temp
,
num_beams
=
num_beams
,
early_stopping
=
early_stopping
,
no_repeat_ngram_size
=
no_rep
,
num_return_sequences
=
num_return_sequences
,
max_new_tokens
=
labels_len
+
8
)
# , length_penalty=-0.8)
#print("Outputs: ",len(outputs[0]))
#print("Len input ids: ", labels_len)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment