def preprocess_function(examples):
    # Tasks with only one input sentence have no second key
    if sentence2_key is None:
        return tokenizer(examples[sentence1_key], truncation=True)
    return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True)
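# Quick sanity check -- a sketch, runnable once sentence1_key/sentence2_key
# are set below and `dataset` has been loaded (in an earlier cell):
# preprocess_function(dataset["train"][:2])
# -> {'input_ids': [[...], [...]], 'attention_mask': [[...], [...]]}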
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Post-processing depends on the task/metric: stsb is a regression
    # task, so the single output is used directly; every other GLUE task
    # is classification, so we take the argmax over the logits
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)
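# Example call with made-up logits for a 2-class task (a sketch; assumes
# `task` and `metric` were defined in an earlier cell, and the returned
# key depends on the task's metric):
# compute_metrics((np.array([[0.1, 0.9], [2.0, -1.0]]), np.array([1, 0])))
# -> e.g. {'accuracy': 1.0}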
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Pretrained model setting
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

# Load the tokenizer (use_fast=True selects the Rust-backed fast tokenizer)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
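# What the tokenizer produces, for reference (ids shown are illustrative):
print(tokenizer("Hello, this one sentence!"))
# -> {'input_ids': [101, ..., 102], 'attention_mask': [1, ..., 1]}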
# Column names holding sentence 1 and sentence 2 for each task.
# For the current task: sentence1_key == "sentence", sentence2_key is None
sentence1_key, sentence2_key = task_to_keys[task]

# map() caches its results, so a re-run loads them instead of recomputing;
# pass load_from_cache_file=False to force a fresh computation.
# batched=True feeds examples to preprocess_function in batches, which is
# much faster with a fast tokenizer
encoded_dataset = dataset.map(preprocess_function, batched=True)
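# The mapped dataset keeps the original columns and adds the tokenizer
# outputs; the exact names depend on the task:
print(encoded_dataset["train"].column_names)
# -> e.g. ['sentence', 'label', 'idx', 'input_ids', 'attention_mask']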
# Load the model with a classification head sized for the task:
# mnli has 3 labels, stsb is regression (1 output), the rest are binary
num_labels = 3 if task.startswith("mnli") else 1 if task == "stsb" else 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
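# The classification head is newly (randomly) initialized, which is why
# from_pretrained warns about newly initialized weights; quick check:
print(model.config.num_labels)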
# Metric used to compare checkpoints: Pearson correlation for stsb,
# Matthews correlation for cola, accuracy for everything else
metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"
model_name = model_checkpoint.split("/")[-1]
# Training arguments
args = TrainingArguments(
    "test-glue",
    evaluation_strategy="epoch",              # evaluate at the end of each epoch
    save_strategy="epoch",                    # save a checkpoint each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,   # training batch size per device
    per_device_eval_batch_size=batch_size,    # evaluation batch size per device
    num_train_epochs=5,
    weight_decay=0.01,                        # weight decay (regularization), not an LR schedule
    load_best_model_at_end=True,              # reload the best checkpoint when training ends
    metric_for_best_model=metric_name,        # metric that decides "best"
    push_to_hub=True,                         # upload checkpoints to the Hugging Face Hub
    push_to_hub_model_id=f"{model_name}-finetuned-{task}",
)
# Build the Trainer.
# mnli ships two validation splits: matched and mismatched
validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Train
trainer.train()

# Evaluate on the validation split
trainer.evaluate()
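# Since push_to_hub=True, the final model can also be uploaded explicitly
# (requires being logged in, e.g. via `huggingface-cli login`):
# trainer.push_to_hub()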
# Hyperparameter search (requires a backend such as optuna or ray[tune]).
# hyperparameter_search needs a model_init callable so that each trial
# starts from a freshly initialized model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
# Pass model_init instead of a model instance
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Returns the best-performing hyperparameter combination
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")

# The best hyperparameters can be written back onto the trainer's args:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)
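# With the best hyperparameters applied, a final full run can be launched
# (model_init re-creates the model, so this trains from scratch):
# trainer.train()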