#!/usr/bin/env python
import logging
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
import numpy as np
import pandas as pd
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
DataCollatorWithPadding,
EvalPrediction,
Trainer,
TrainingArguments,
pipeline,
)
from ..constants import DEFAULT_SEMANTIC_EMBEDDING_MODEL
from ..data.soft_search_2022 import SoftSearch2022DatasetFields
from ..metrics import EvaluationMetrics
if TYPE_CHECKING:
from datasets.arrow_dataset import Batch
from transformers.pipelines.base import Pipeline
from transformers.tokenization_utils_base import BatchEncoding
from transformers.trainer_utils import TrainOutput
###############################################################################
log = logging.getLogger(__name__)
###############################################################################
DEFAULT_SOFT_SEARCH_TRANSFORMER_PATH = Path("soft-search-transformer/").resolve()
TRANSFORMER_LABEL_COL = "transformer_label"
HUGGINGFACE_HUB_SOFT_SEARCH_MODEL = "evamaxfield/soft-search"
###############################################################################


def train(
train_df: Union[str, Path, pd.DataFrame],
test_df: Union[str, Path, pd.DataFrame],
text_col: str = SoftSearch2022DatasetFields.abstract_text,
label_col: str = SoftSearch2022DatasetFields.label,
model_storage_path: Union[str, Path] = DEFAULT_SOFT_SEARCH_TRANSFORMER_PATH,
base_model: str = DEFAULT_SEMANTIC_EMBEDDING_MODEL,
extra_training_args: Optional[Dict[str, Any]] = None,
) -> Tuple[Path, Trainer, "TrainOutput", EvaluationMetrics]:
"""
    Fine-tune a transformer model to classify text against the provided labels.
    This function both trains and evaluates the performance of the
    fine-tuned transformer.
Parameters
----------
train_df: Union[str, Path, pd.DataFrame]
The data to use for training.
Only CSV file format is supported when providing a file path.
test_df: Union[str, Path, pd.DataFrame]
        The data to use for evaluation.
Only CSV file format is supported when providing a file path.
text_col: str
The column name which contains the raw text.
Default: "abstract_text"
label_col: str
The column name which contains the labels.
Default: "label"
model_storage_path: Union[str, Path]
The path to store the model to.
Default: "soft-search-transformer/"
base_model: str
The base model to fine-tune.
Default: "distilbert-base-uncased-finetuned-sst-2-english"
extra_training_args: Optional[Dict[str, Any]]
        Any extra keyword arguments to pass through to the
        ``TrainingArguments`` object (not the ``Trainer`` itself).
Returns
-------
Path
The path to the stored model.
Trainer
The Trainer object.
TrainOutput
The final output of the trainer.train() call.
EvaluationMetrics
The evaluation metrics.
Examples
--------
Example training from supplied manually labelled data.
>>> from soft_search.data import load_joined_soft_search_2022
>>> from soft_search.label import transformer
>>> from sklearn.model_selection import train_test_split
>>> df = load_joined_soft_search_2022()
>>> train, test = train_test_split(
... df,
... test_size=0.2,
... stratify=df["label"]
... )
    >>> model_path, _, _, metrics = transformer.train(train, test)
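
    Example passing extra arguments through to ``TrainingArguments``
    (the ``warmup_steps`` value here is purely illustrative):

    >>> model_path, _, _, metrics = transformer.train(
    ...     train,
    ...     test,
    ...     extra_training_args={"warmup_steps": 100},
    ... )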
See Also
--------
label
A function to apply a model across a pandas DataFrame.
"""
# Handle storage dir
model_storage_path = Path(model_storage_path).resolve()
    # Read train DataFrame from file if a path was provided
    if isinstance(train_df, (str, Path)):
        train_df = pd.read_csv(train_df)
    # Read test DataFrame from file if a path was provided
    if isinstance(test_df, (str, Path)):
        test_df = pd.read_csv(test_df)
# Rename cols
train_df = train_df.copy(deep=True)
train_df = train_df[[label_col, text_col]]
train_df = train_df.rename(columns={label_col: "label", text_col: "text"})
test_df = test_df.copy(deep=True)
test_df = test_df[[label_col, text_col]]
test_df = test_df.rename(columns={label_col: "label", text_col: "text"})
# Train and test should have the same label names
# only grab from train
    # Sort so the ids match the alphabetical encoding used by
    # Dataset.class_encode_column below
    label_names = sorted(train_df["label"].unique().tolist())
# Construct label to id and vice-versa LUTs
label2id, id2label = {}, {}
for i, label in enumerate(label_names):
label2id[label] = str(i)
id2label[str(i)] = label
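    # e.g. label2id == {"<label-a>": "0", "<label-b>": "1"} and id2label is the
    # inverse ("<label-a>"/"<label-b>" are placeholders; the actual keys are
    # whatever values appear in the label column)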
# Cast to dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
train_dataset = train_dataset.class_encode_column("label")
test_dataset = test_dataset.class_encode_column("label")
# Preprocess
tokenizer = AutoTokenizer.from_pretrained(base_model)
    def preprocess_function(examples: "Batch") -> "BatchEncoding":
        # Tokenize the raw text, truncating to the model's max sequence length
        return tokenizer(examples["text"], truncation=True)
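    # Pad each batch dynamically to the longest sequence in that batch
    # rather than padding everything to a single global length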
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
# AutoModel
model = AutoModelForSequenceClassification.from_pretrained(
base_model,
num_labels=len(id2label),
label2id=label2id,
id2label=id2label,
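        # Allow loading a base model whose classification head differs in
        # size; a new head with ``num_labels`` outputs is newly initialized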
ignore_mismatched_sizes=True,
)
# Catch None extra args
if extra_training_args is None:
extra_training_args = {}
# Training Args
training_args = TrainingArguments(
output_dir=model_storage_path,
evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=3e-5,
logging_steps=10,
load_best_model_at_end=True,
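        # "f1" must match a key in the dict returned by compute_metrics
        # (the Trainer logs it as "eval_f1")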
metric_for_best_model="f1",
per_device_train_batch_size=12,
per_device_eval_batch_size=12,
num_train_epochs=5,
weight_decay=0.01,
**extra_training_args,
)
# Compute accuracy metrics
acc_metric = load_metric("accuracy")
pre_metric = load_metric("precision")
rec_metric = load_metric("recall")
f1_metric = load_metric("f1")
    def compute_metrics(eval_pred: EvalPrediction) -> Dict[str, float]:
        # Convert raw logits to predicted class ids
        predictions = np.argmax(eval_pred.predictions, axis=-1)
f1_score = f1_metric.compute(
predictions=predictions,
references=eval_pred.label_ids,
)
acc_score = acc_metric.compute(
predictions=predictions,
references=eval_pred.label_ids,
)
pre_score = pre_metric.compute(
predictions=predictions,
references=eval_pred.label_ids,
)
rec_score = rec_metric.compute(
predictions=predictions,
references=eval_pred.label_ids,
)
return {
**f1_score,
**acc_score,
**pre_score,
**rec_score,
}
# Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_train_dataset,
eval_dataset=tokenized_test_dataset,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
    # Train, then evaluate against the held-out test set
    epoch_metrics = trainer.train()
    raw_eval_metrics = trainer.evaluate(tokenized_test_dataset)
eval_metrics = EvaluationMetrics(
model="transformer",
accuracy=raw_eval_metrics["eval_accuracy"],
precision=raw_eval_metrics["eval_precision"],
recall=raw_eval_metrics["eval_recall"],
f1=raw_eval_metrics["eval_f1"],
)
# Store model
trainer.save_model()
return model_storage_path, trainer, epoch_metrics, eval_metrics


def _train_and_upload_transformer(seed: int = 0) -> Path:
import os
from ..data import load_soft_search_2022_training
from ..seed import set_seed
# Set global seed
set_seed(seed)
# Load data, train
df = load_soft_search_2022_training()
train_df, test_df = train_test_split(df, test_size=0.2)
    # Train and push the final model to the Hugging Face Hub
    model_path, _, _, _ = train(
train_df,
test_df,
extra_training_args={
"push_to_hub": True,
"hub_model_id": HUGGINGFACE_HUB_SOFT_SEARCH_MODEL,
"hub_strategy": "end",
"hub_token": os.environ["HUGGINGFACE_TOKEN"],
},
)
    return model_path


def _apply_transformer(text: str, classifier: "Pipeline") -> str:
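    # With ``top_k=1`` the pipeline returns a list of dicts like
    # [{"label": "...", "score": ...}]; keep only the top label string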
return classifier(text, truncation=True, top_k=1)[0]["label"]


def label(
df: pd.DataFrame,
apply_column: str = SoftSearch2022DatasetFields.abstract_text,
label_column: str = TRANSFORMER_LABEL_COL,
model: Union[str, Path] = HUGGINGFACE_HUB_SOFT_SEARCH_MODEL,
) -> pd.DataFrame:
"""
    Add a new column to the provided pandas DataFrame (in-place) with a label
    of whether or not software is predicted, using a trained transformer model.
Parameters
----------
df: pd.DataFrame
The pandas DataFrame to in-place add a column with the
software predicted outcome labels.
apply_column: str
The column to use for "prediction".
Default: "text"
label_column: str
The name of the column to add with outcome "prediction".
Default: "transformer_label"
model: Union[str, Path]
The path to the stored model.
Default: https://huggingface.co/evamaxfield/soft-search (latest CI model)
Returns
-------
pd.DataFrame
The same pandas DataFrame but with a new column added in-place containing
the software outcome prediction.
See Also
--------
soft_search.nsf.get_nsf_dataset
Function to get an NSF dataset for prediction.
Examples
--------
Example application to a new NSF dataset.
>>> from soft_search import constants, nsf
>>> from soft_search.label import transformer
>>> df = nsf.get_nsf_dataset(
... "2016-01-01",
... "2017-01-01",
... dataset_fields=[constants.NSFFields.abstractText],
... )
>>> predicted = transformer.label(
... df,
... apply_column=constants.NSFFields.abstractText,
... )
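
    Example using a locally stored model instead of the Hub default
    (assumes a model was previously trained and saved to this path):

    >>> predicted = transformer.label(
    ...     df,
    ...     apply_column=constants.NSFFields.abstractText,
    ...     model="soft-search-transformer/",
    ... )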
"""
    # Load a text-classification pipeline from a local path or a
    # Hugging Face Hub model id
    classifier = pipeline(
        "text-classification", model=str(model), tokenizer=str(model)
    )
    # Bind the classifier so the helper can be applied row-wise
apply_classifier = partial(_apply_transformer, classifier=classifier)
df[label_column] = df[apply_column].apply(apply_classifier)
return df