Source code for soft_search.label.model_selection

#!/usr/bin/env python

import os
import shutil

import pandas as pd
from sklearn.model_selection import train_test_split

from ..data import _DATA_DIR
from ..data.soft_search_2022 import (
    SoftSearch2022DatasetFields,
    load_soft_search_2022_training,
)
from ..seed import set_seed
from . import regex, semantic_logit, tfidf_logit, transformer
from .tfidf_logit import (
    ABSTRACT_SOURCE_TFIDF_LOGIT_PATH,
    OUTCOMES_SOURCE_TFIDF_LOGIT_PATH,
)
from .transformer import HUGGINGFACE_HUB_SOFT_SEARCH_MODEL

###############################################################################


def fit_and_eval_all_models(
    test_size: float = 0.2,
    seed: int = 0,
    archive: bool = False,
    train_transformer: bool = True,
    push_transformer: bool = False,
) -> pd.DataFrame:
    """
    Fit and evaluate every available model on each predictive text source.

    For each text column (abstract text, then project outcomes) the rows with
    a missing value in that column are dropped, the remainder is split into
    stratified train / test sets, and the regex, TF-IDF logit, semantic logit
    and (optionally) transformer models are trained and evaluated.

    Parameters
    ----------
    test_size: float
        Forwarded to ``train_test_split`` as ``test_size``.
        Default: 0.2
    seed: int
        Value passed to ``set_seed`` before any data is loaded.
        Default: 0
    archive: bool
        If True, copy the fitted TF-IDF logit pipeline file into ``_DATA_DIR``.
        Default: False
    train_transformer: bool
        If True, also train and evaluate the transformer model.
        Default: True
    push_transformer: bool
        If True (and ``train_transformer`` is True), pass Hugging Face Hub push
        arguments to the transformer training call. The hub token is read
        from the ``HUGGINGFACE_TOKEN`` environment variable.
        Default: False

    Returns
    -------
    pd.DataFrame
        One row per (predictive source, model) pair holding the model metrics
        plus a ``predictive_source`` column, sorted by ``f1`` in descending
        order with a fresh integer index.

    Raises
    ------
    KeyError
        If both ``train_transformer`` and ``push_transformer`` are True and
        the ``HUGGINGFACE_TOKEN`` environment variable is not set. This is
        raised before any model is trained.
    """
    # Set global seed
    set_seed(seed)

    # Resolve the hub arguments once, before any training starts, so that a
    # missing token fails immediately instead of after the other models have
    # already been fit for the first text column.
    hub_training_args = {}
    if train_transformer and push_transformer:
        hub_training_args = {
            "push_to_hub": True,
            "hub_model_id": HUGGINGFACE_HUB_SOFT_SEARCH_MODEL,
            "hub_strategy": "end",
            "hub_token": os.environ["HUGGINGFACE_TOKEN"],
        }

    # Load core data
    data = load_soft_search_2022_training()

    # Run both models (prediction from abstract and prediction from outcomes)
    results = []
    for text_col in [
        SoftSearch2022DatasetFields.abstract_text,
        SoftSearch2022DatasetFields.project_outcomes,
    ]:
        # Subset / drop na for this text col
        subset = data.dropna(subset=[text_col])

        # Store the "predictive_source" column value
        predictive_source = {"predictive_source": text_col.replace("_", "-")}

        # Split the data
        # NOTE(review): no explicit random_state is passed here; the split is
        # only reproducible if set_seed() seeds NumPy's global RNG -- confirm.
        train_df, test_df = train_test_split(
            subset,
            test_size=test_size,
            stratify=subset[SoftSearch2022DatasetFields.label],
        )

        # Run each model
        # Regex (only receives the test split)
        regex_metrics = regex.train(
            test_df,
            text_col=text_col,
            label_col=SoftSearch2022DatasetFields.label,
        )
        results.append(
            {
                **predictive_source,
                **regex_metrics.to_dict(),
            }
        )

        # TFIDF
        tfidf_output_path = (
            ABSTRACT_SOURCE_TFIDF_LOGIT_PATH
            if text_col == SoftSearch2022DatasetFields.abstract_text
            else OUTCOMES_SOURCE_TFIDF_LOGIT_PATH
        )
        tfidf_logit_pipeline_path, _, tfidf_logit_metrics = tfidf_logit.train(
            train_df=train_df,
            test_df=test_df,
            text_col=text_col,
            label_col=SoftSearch2022DatasetFields.label,
            model_storage_path=tfidf_output_path,
        )
        results.append(
            {
                **predictive_source,
                **tfidf_logit_metrics.to_dict(),
            }
        )

        # Archive
        # We only save the tfidf-logit pipeline because it typically performs
        # the best
        if archive:
            shutil.copy2(
                tfidf_logit_pipeline_path,
                _DATA_DIR,
            )

        # Semantic
        _, _, semantic_logit_metrics = semantic_logit.train(
            train_df=train_df,
            test_df=test_df,
            text_col=text_col,
            label_col=SoftSearch2022DatasetFields.label,
        )
        results.append(
            {
                **predictive_source,
                **semantic_logit_metrics.to_dict(),
            }
        )

        # Transformer
        if train_transformer:
            _, _, _, transformer_metrics = transformer.train(
                train_df=train_df,
                test_df=test_df,
                # Fresh dict per call so each training run gets its own copy
                extra_training_args=dict(hub_training_args),
                text_col=text_col,
                label_col=SoftSearch2022DatasetFields.label,
            )
            results.append(
                {
                    **predictive_source,
                    **transformer_metrics.to_dict(),
                }
            )

    # Create dataframe with metrics
    return (
        pd.DataFrame(results)
        .sort_values(by="f1", ascending=False)
        .reset_index(
            drop=True,
        )
    )