#!/usr/bin/env python

import logging
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

import numpy as np
import pandas as pd
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    pipeline,
)

from ..constants import DEFAULT_SEMANTIC_EMBEDDING_MODEL
from ..data.soft_search_2022 import SoftSearch2022DatasetFields
from ..metrics import EvaluationMetrics

if TYPE_CHECKING:
    from datasets.arrow_dataset import Batch
    from transformers.pipelines.base import Pipeline
    from transformers.tokenization_utils_base import BatchEncoding
    from transformers.trainer_utils import TrainOutput

###############################################################################

log = logging.getLogger(__name__)

###############################################################################

DEFAULT_SOFT_SEARCH_TRANSFORMER_PATH = Path("soft-search-transformer/").resolve()
TRANSFORMER_LABEL_COL = "transformer_label"
HUGGINGFACE_HUB_SOFT_SEARCH_MODEL = "evamaxfield/soft-search"

###############################################################################


def train(
    train_df: Union[str, Path, pd.DataFrame],
    test_df: Union[str, Path, pd.DataFrame],
    text_col: str = SoftSearch2022DatasetFields.abstract_text,
    label_col: str = SoftSearch2022DatasetFields.label,
    model_storage_path: Union[str, Path] = DEFAULT_SOFT_SEARCH_TRANSFORMER_PATH,
    base_model: str = DEFAULT_SEMANTIC_EMBEDDING_MODEL,
    extra_training_args: Optional[Dict[str, Any]] = None,
) -> Tuple[Path, Trainer, "TrainOutput", EvaluationMetrics]:
    """
    Fine-tune a transformer model to classify the provided labels.

    This function will both train and evaluate the performance of the
    fine-tuned transformer.

    Parameters
    ----------
    train_df: Union[str, Path, pd.DataFrame]
        The data to use for training.
        Only CSV file format is supported when providing a file path.
    test_df: Union[str, Path, pd.DataFrame]
        The data to use for testing.
        Only CSV file format is supported when providing a file path.
    text_col: str
        The column name which contains the raw text.
        Default: "abstract_text"
    label_col: str
        The column name which contains the labels.
        Default: "label"
    model_storage_path: Union[str, Path]
        The path to store the model to.
        Default: "soft-search-transformer/"
    base_model: str
        The base model to fine-tune.
        Default: "distilbert-base-uncased-finetuned-sst-2-english"
    extra_training_args: Optional[Dict[str, Any]]
        Any extra arguments to pass to the Trainer object.

    Returns
    -------
    Path
        The path to the stored model.
    Trainer
        The Trainer object.
    TrainOutput
        The final output of the trainer.train() call.
    EvaluationMetrics
        The evaluation metrics.

    Examples
    --------
    Example training from supplied manually labelled data.

    >>> from soft_search.data import load_joined_soft_search_2022
    >>> from soft_search.label import transformer
    >>> from sklearn.model_selection import train_test_split
    >>> df = load_joined_soft_search_2022()
    >>> train, test = train_test_split(
    ...     df,
    ...     test_size=0.2,
    ...     stratify=df["label"],
    ... )
    >>> model = transformer.train(train, test)

    See Also
    --------
    label
        A function to apply a model across a pandas DataFrame.
""" # Handle storage dir model_storage_path = Path(model_storage_path).resolve() # Read DataFrame if isinstance(train_df, (str, Path)): train_df = pd.read_csv(train_df) # Read DataFrame if isinstance(test_df, (str, Path)): test_df = pd.read_csv(test_df) # Rename cols train_df = train_df.copy(deep=True) train_df = train_df[[label_col, text_col]] train_df = train_df.rename(columns={label_col: "label", text_col: "text"}) test_df = test_df.copy(deep=True) test_df = test_df[[label_col, text_col]] test_df = test_df.rename(columns={label_col: "label", text_col: "text"}) # Train and test should have the same label names # only grab from train label_names = train_df["label"].unique().tolist() # Construct label to id and vice-versa LUTs label2id, id2label = {}, {} for i, label in enumerate(label_names): label2id[label] = str(i) id2label[str(i)] = label # Cast to dataset train_dataset = Dataset.from_pandas(train_df) test_dataset = Dataset.from_pandas(test_df) train_dataset = train_dataset.class_encode_column("label") test_dataset = test_dataset.class_encode_column("label") # Preprocess tokenizer = AutoTokenizer.from_pretrained(base_model) def preprocess_function(examples: "BatchEncoding") -> "Batch": return tokenizer(examples["text"], truncation=True) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True) tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True) # AutoModel model = AutoModelForSequenceClassification.from_pretrained( base_model, num_labels=len(id2label), label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True, ) # Catch None extra args if extra_training_args is None: extra_training_args = {} # Training Args training_args = TrainingArguments( output_dir=model_storage_path, evaluation_strategy="epoch", save_strategy="epoch", learning_rate=3e-5, logging_steps=10, load_best_model_at_end=True, metric_for_best_model="f1", per_device_train_batch_size=12, per_device_eval_batch_size=12, num_train_epochs=5, weight_decay=0.01, **extra_training_args, ) # Compute accuracy metrics acc_metric = load_metric("accuracy") pre_metric = load_metric("precision") rec_metric = load_metric("recall") f1_metric = load_metric("f1") def compute_metrics(eval_pred: EvalPrediction) -> Optional[Dict]: predictions = np.argmax(eval_pred.predictions, axis=-1) f1_score = f1_metric.compute( predictions=predictions, references=eval_pred.label_ids, ) acc_score = acc_metric.compute( predictions=predictions, references=eval_pred.label_ids, ) pre_score = pre_metric.compute( predictions=predictions, references=eval_pred.label_ids, ) rec_score = rec_metric.compute( predictions=predictions, references=eval_pred.label_ids, ) return { **f1_score, **acc_score, **pre_score, **rec_score, } # Trainer trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_train_dataset, eval_dataset=tokenized_test_dataset, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) # Train epoch_metrics = trainer.train() raw_eval_metrics = trainer.evaluate(tokenized_test_dataset) eval_metrics = EvaluationMetrics( model="transformer", accuracy=raw_eval_metrics["eval_accuracy"], precision=raw_eval_metrics["eval_precision"], recall=raw_eval_metrics["eval_recall"], f1=raw_eval_metrics["eval_f1"], ) # Store model trainer.save_model() return model_storage_path, trainer, epoch_metrics, eval_metrics
def _train_and_upload_transformer(seed: int = 0) -> Path:
    import os

    from ..data import load_soft_search_2022_training
    from ..seed import set_seed

    # Set global seed
    set_seed(seed)

    # Load data, train
    df = load_soft_search_2022_training()
    train_df, test_df = train_test_split(df, test_size=0.2)
    model_storage_path, _, _, _ = train(
        train_df,
        test_df,
        extra_training_args={
            "push_to_hub": True,
            "hub_model_id": HUGGINGFACE_HUB_SOFT_SEARCH_MODEL,
            "hub_strategy": "end",
            "hub_token": os.environ["HUGGINGFACE_TOKEN"],
        },
    )
    return model_storage_path


def _apply_transformer(text: str, classifier: "Pipeline") -> str:
    return classifier(text, truncation=True, top_k=1)[0]["label"]
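
# Note on `_apply_transformer`: with `top_k=1`, a Hugging Face
# text-classification pipeline returns a list containing a single
# {"label": ..., "score": ...} dict per input, so `[0]["label"]` extracts the
# top predicted label string. For example (label and score are illustrative;
# actual label names come from the fine-tuned model's id2label mapping):
#
#   classifier("Funds will support new analysis software.", truncation=True, top_k=1)
#   -> [{"label": "software-predicted", "score": 0.97}]
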
def label(
    df: pd.DataFrame,
    apply_column: str = SoftSearch2022DatasetFields.abstract_text,
    label_column: str = TRANSFORMER_LABEL_COL,
    model: Union[str, Path] = HUGGINGFACE_HUB_SOFT_SEARCH_MODEL,
) -> pd.DataFrame:
    """
    In-place add a new column to the provided pandas DataFrame with a label
    of software predicted or not using a trained transformer model.

    Parameters
    ----------
    df: pd.DataFrame
        The pandas DataFrame to in-place add a column with the
        software predicted outcome labels.
    apply_column: str
        The column to use for "prediction".
        Default: "abstract_text"
    label_column: str
        The name of the column to add with outcome "prediction".
        Default: "transformer_label"
    model: Union[str, Path]
        The path to the stored model.
        Default: https://huggingface.co/evamaxfield/soft-search (latest CI model)

    Returns
    -------
    pd.DataFrame
        The same pandas DataFrame but with a new column added in-place
        containing the software outcome prediction.

    See Also
    --------
    soft_search.nsf.get_nsf_dataset
        Function to get an NSF dataset for prediction.

    Examples
    --------
    Example application to a new NSF dataset.

    >>> from soft_search import constants, nsf
    >>> from soft_search.label import transformer
    >>> df = nsf.get_nsf_dataset(
    ...     "2016-01-01",
    ...     "2017-01-01",
    ...     dataset_fields=[constants.NSFFields.abstractText],
    ... )
    >>> predicted = transformer.label(
    ...     df,
    ...     apply_column=constants.NSFFields.abstractText,
    ... )
    """
    # Load label pipeline
    classifier = pipeline(
        "text-classification",
        model=str(model),
        tokenizer=str(model),
    )

    # Partial func
    apply_classifier = partial(_apply_transformer, classifier=classifier)
    df[label_column] = df[apply_column].apply(apply_classifier)
    return df
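
# Illustrative usage sketch (comment only): since the `model` parameter is
# Union[str, Path], `label` also accepts a local model path in place of the
# Hugging Face Hub id, e.g. a model previously stored by `train` at the
# default storage path.
#
#   from soft_search.label import transformer
#
#   predicted = transformer.label(
#       df,
#       model=transformer.DEFAULT_SOFT_SEARCH_TRANSFORMER_PATH,
#   )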