Trying to train a HuggingFace model
I'm trying to train a model and I get an error, but I can't figure out how to fix it. Googling didn't turn up anything useful.
ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,labels,output_attentions,output_hidden_states,return_dict,labels,label,label_ids.
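If I read the error message right, the Trainer expects every training example to already contain the tokenized fields plus a numeric label, i.e. something like this (field names taken from the key list in the message, the values are made up by me):

# my guess at what a single training example should look like
example = {
    "input_ids": [101, 2023, 2003, 1037, 7953, 102],
    "attention_mask": [1, 1, 1, 1, 1, 1],
    "labels": 3,  # an integer class id, not a category-name string
}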
Here's the code itself:
## imports
from datasets import Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import matplotlib.pyplot as plt  # needed for the metrics plot at the end
## data
# Load 20-newsgroup dataset and arrange it into a list of tuples
# data = [("description1", "category1"), ("description2", "category2"), ...]
newsgroups_train = fetch_20newsgroups(subset="train")
data = [
    (
        newsgroups_train.data[i],
        newsgroups_train.target_names[newsgroups_train.target[i]],
    )
    for i in range(len(newsgroups_train.data))
]
## Prepare the dataset
descriptions = [item[0] for item in data]
categories = [item[1] for item in data]
# Tokenizer and Model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(set(categories))
)
## Encoding data
def encode(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length")
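# note: encode() returns only the tokenizer output (input_ids, attention_mask, ...);
# it does not carry over a "labels" field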
## Train/Test split
(
    train_descriptions,
    test_descriptions,
    train_categories,
    test_categories,
) = train_test_split(descriptions, categories, test_size=0.2)
training_args = TrainingArguments("test_trainer")
def hugginface_dataset(text, labels):
    return Dataset.from_dict(
        {
            "text": text,
            "labels": labels,
        }
    )
train_dataset = hugginface_dataset(train_descriptions, train_categories)
test_dataset = hugginface_dataset(test_descriptions, test_categories)
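# at this point each Dataset holds the raw "text" plus the category name string as "labels"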
## Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encode(train_dataset),
    eval_dataset=encode(test_dataset),
    # compute_metrics=compute_metrics,
)
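# (what gets passed to train_dataset/eval_dataset above is the raw tokenizer output,
# a BatchEncoding, not a datasets.Dataset)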
## Train the model
trainer.train()
# Evaluation
train_metrics = trainer.evaluate(train_dataset)
test_metrics = trainer.evaluate(test_dataset)
# Print metrics
print("Training metrics:", train_metrics)
print("Testing metrics:", test_metrics)
# Collect metrics
metrics = {"train": train_metrics, "test": test_metrics}
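# not sure about this part yet: I think evaluate() returns keys like "eval_loss",
# so these lookups may need adjusting, but I never get this far because of the error above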
# Plot metrics
plt.figure(figsize=(10, 6))
for metric in ["loss", "accuracy"]:
    plt.plot(
        ["train", "test"],
        [metrics["train"][metric], metrics["test"][metric]],
        label=metric,
    )
plt.legend()
plt.title("Training and testing metrics")
plt.show()
##
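My own suspicion is that the problem is in how I pass the data to the Trainer: encode(train_dataset) returns only the tokenizer output without the labels, and the labels are still category-name strings anyway. Below is the rework I'm considering; it's just a sketch (the label2id mapping and the hugginface_dataset_v2 name are mine), where I first convert the category names to integer ids and then tokenize with Dataset.map so that the "labels" column and the tokenized fields end up in the same dataset.

# sketch of the rework I'm considering (not sure it's the right approach)
label2id = {name: i for i, name in enumerate(sorted(set(categories)))}

def hugginface_dataset_v2(texts, labels):
    # same as above, but with integer class ids instead of strings
    return Dataset.from_dict(
        {
            "text": texts,
            "labels": [label2id[label] for label in labels],
        }
    )

train_dataset = hugginface_dataset_v2(train_descriptions, train_categories)
test_dataset = hugginface_dataset_v2(test_descriptions, test_categories)

# Dataset.map adds the tokenizer columns and keeps the existing "labels" column
train_encoded = train_dataset.map(encode, batched=True)
test_encoded = test_dataset.map(encode, batched=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=test_encoded,
)

Is this the direction the error message is pointing at, or am I missing something else?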