#!/usr/bin/env python
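"""Fit and evaluate every soft-search labeling model (regex, tfidf-logit,
semantic-logit, and optionally the transformer) against both predictive text
sources and collect their metrics into a single comparison table."""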
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from ..data import _DATA_DIR
from ..data.soft_search_2022 import (
    SoftSearch2022DatasetFields,
    load_soft_search_2022_training,
)
from ..seed import set_seed
from . import regex, semantic_logit, tfidf_logit, transformer
from .tfidf_logit import (
    ABSTRACT_SOURCE_TFIDF_LOGIT_PATH,
    OUTCOMES_SOURCE_TFIDF_LOGIT_PATH,
)
from .transformer import HUGGINGFACE_HUB_SOFT_SEARCH_MODEL
###############################################################################
def fit_and_eval_all_models(
    test_size: float = 0.2,
    seed: int = 0,
    archive: bool = False,
    train_transformer: bool = True,
    push_transformer: bool = False,
) -> pd.DataFrame:
    """
    Fit and evaluate all models (regex, tfidf-logit, semantic-logit, and
    optionally the transformer) against both predictive text sources
    (abstract text and project outcomes) and return their metrics.

    Parameters
    ----------
    test_size: float
        Fraction of each text-source subset held out for evaluation.
    seed: int
        Global random seed (set via ``set_seed``).
    archive: bool
        If True, copy the fitted tfidf-logit pipeline into the package data directory.
    train_transformer: bool
        If True, also fine-tune and evaluate the transformer model.
    push_transformer: bool
        If True, push the fine-tuned transformer to the Hugging Face Hub
        (requires the HUGGINGFACE_TOKEN environment variable).

    Returns
    -------
    pd.DataFrame
        One row of metrics per (predictive source, model) pair, sorted by F1.
    """
    # Set global seed
    set_seed(seed)

    # Load core data
    data = load_soft_search_2022_training()

    # Run all models against both predictive text sources
    # (prediction from abstract and prediction from outcomes)
    results = []
    for text_col in [
        SoftSearch2022DatasetFields.abstract_text,
        SoftSearch2022DatasetFields.project_outcomes,
    ]:
        # Subset / drop na for this text col
        subset = data.dropna(subset=[text_col])

        # Store the "predictive_source" column value
        predictive_source = {"predictive_source": text_col.replace("_", "-")}

        # Split the data
        train_df, test_df = train_test_split(
            subset,
            test_size=test_size,
            stratify=subset[SoftSearch2022DatasetFields.label],
        )

        # Run each model
        # Regex
        regex_metrics = regex.train(
            test_df,
            text_col=text_col,
            label_col=SoftSearch2022DatasetFields.label,
        )
        results.append(
            {
                **predictive_source,
                **regex_metrics.to_dict(),
            }
        )

        # TFIDF
        if text_col == SoftSearch2022DatasetFields.abstract_text:
            tfidf_output_path = ABSTRACT_SOURCE_TFIDF_LOGIT_PATH
        else:
            tfidf_output_path = OUTCOMES_SOURCE_TFIDF_LOGIT_PATH
        tfidf_logit_pipeline_path, _, tfidf_logit_metrics = tfidf_logit.train(
            train_df=train_df,
            test_df=test_df,
            text_col=text_col,
            label_col=SoftSearch2022DatasetFields.label,
            model_storage_path=tfidf_output_path,
        )
        results.append(
            {
                **predictive_source,
                **tfidf_logit_metrics.to_dict(),
            }
        )

        # Archive
        # We only save the tfidf-logit pipeline
        # because it typically performs the best
        if archive:
            shutil.copy2(
                tfidf_logit_pipeline_path,
                _DATA_DIR,
            )

        # Semantic
        _, _, semantic_logit_metrics = semantic_logit.train(
            train_df=train_df,
            test_df=test_df,
            text_col=text_col,
            label_col=SoftSearch2022DatasetFields.label,
        )
        results.append(
            {
                **predictive_source,
                **semantic_logit_metrics.to_dict(),
            }
        )

        # Transformer
        if train_transformer:
            extra_training_args = {}
            if push_transformer:
                extra_training_args = {
                    "push_to_hub": True,
                    "hub_model_id": HUGGINGFACE_HUB_SOFT_SEARCH_MODEL,
                    "hub_strategy": "end",
                    "hub_token": os.environ["HUGGINGFACE_TOKEN"],
                }
            _, _, _, transformer_metrics = transformer.train(
                train_df=train_df,
                test_df=test_df,
                extra_training_args=extra_training_args,
                text_col=text_col,
                label_col=SoftSearch2022DatasetFields.label,
            )
            results.append(
                {
                    **predictive_source,
                    **transformer_metrics.to_dict(),
                }
            )

    # Create dataframe with metrics
    return (
        pd.DataFrame(results)
        .sort_values(by="f1", ascending=False)
        .reset_index(
            drop=True,
        )
    )
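

###############################################################################
# A minimal usage sketch (not part of the original module): running the full
# model comparison and printing the metrics table. The argument values below
# are illustrative assumptions, not prescribed settings; because this module
# uses relative imports, run it with ``python -m`` from the package root.
# Pushing the transformer to the Hugging Face Hub would additionally require
# the HUGGINGFACE_TOKEN environment variable to be set.
if __name__ == "__main__":
    metrics = fit_and_eval_all_models(
        test_size=0.2,
        seed=0,
        archive=False,
        train_transformer=False,  # skip the (slow) transformer fine-tune for a quick run
        push_transformer=False,
    )
    print(metrics)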