#!/usr/bin/env python
import logging
import pickle
from pathlib import Path
from typing import Tuple, Union
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.pipeline import Pipeline, make_pipeline
from ..data import _DATA_DIR
from ..data.soft_search_2022 import SoftSearch2022DatasetFields
from ..metrics import EvaluationMetrics
###############################################################################
ABSTRACT_SOURCE_TFIDF_LOGIT_PATH = Path(
    "soft-search-tfidf-logit-from-abstract.pkl"
).resolve()
OUTCOMES_SOURCE_TFIDF_LOGIT_PATH = Path(
    "soft-search-tfidf-logit-from-outcomes.pkl"
).resolve()
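# Archived copies of the trained models stored in the package data directory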
ARCHIVED_SOFT_SEARCH_ABSTRACT_SOURCE_TFIDF_LOGIT_PATH = (
    _DATA_DIR / ABSTRACT_SOURCE_TFIDF_LOGIT_PATH.name
)
ARCHIVED_SOFT_SEARCH_OUTCOMES_SOURCE_TFIDF_LOGIT_PATH = (
    _DATA_DIR / OUTCOMES_SOURCE_TFIDF_LOGIT_PATH.name
)
###############################################################################
log = logging.getLogger(__name__)
###############################################################################
def train(
    train_df: Union[str, Path, pd.DataFrame],
    test_df: Union[str, Path, pd.DataFrame],
    text_col: str = SoftSearch2022DatasetFields.abstract_text,
    label_col: str = SoftSearch2022DatasetFields.label,
    model_storage_path: Union[str, Path] = ABSTRACT_SOURCE_TFIDF_LOGIT_PATH,
) -> Tuple[Path, Pipeline, EvaluationMetrics]:
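    """
    Train and evaluate a TF-IDF + LogisticRegressionCV text-classification
    pipeline, storing the fitted model to disk with pickle.

    Parameters
    ----------
    train_df: Union[str, Path, pd.DataFrame]
        The training data, provided either as a DataFrame or as a path to a CSV.
    test_df: Union[str, Path, pd.DataFrame]
        The held-out test data, provided either as a DataFrame or as a path to a CSV.
    text_col: str
        The column containing the text to vectorize.
    label_col: str
        The column containing the labels to predict.
    model_storage_path: Union[str, Path]
        Where to store the pickled, fitted pipeline.

    Returns
    -------
    Tuple[Path, Pipeline, EvaluationMetrics]
        The resolved model storage path, the fitted pipeline, and the
        weighted-average precision, recall, and F1 plus accuracy computed
        against the test set.
    """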
    # Handle storage dir
    model_storage_path = Path(model_storage_path).resolve()

    # Read the training DataFrame from CSV if a path was provided
    if isinstance(train_df, (str, Path)):
        train_df = pd.read_csv(train_df)

    # Read the test DataFrame from CSV if a path was provided
    if isinstance(test_df, (str, Path)):
        test_df = pd.read_csv(test_df)

    # Build the pipeline: TF-IDF features into a cross-validated logistic regression
    pipeline = make_pipeline(
        TfidfVectorizer(
            strip_accents="unicode",
            lowercase=True,
            stop_words="english",
        ),
        LogisticRegressionCV(max_iter=10000),
    )

    # Fit the pipeline
    pipeline.fit(train_df[text_col], train_df[label_col])

    # Save the fitted pipeline with pickle
    with open(model_storage_path, "wb") as open_f:
        pickle.dump(pipeline, open_f)

    # Evaluate on the held-out test set
    preds = pipeline.predict(test_df[text_col])
    pre, rec, f1, _ = precision_recall_fscore_support(
        test_df[label_col],
        preds,
        average="weighted",
    )
    acc = accuracy_score(test_df[label_col], preds)

    return (
        model_storage_path,
        pipeline,
        EvaluationMetrics(
            model="tfidf-logit",
            precision=pre,
            recall=rec,
            f1=f1,
            accuracy=acc,
        ),
    )


def label() -> None:
    """Labeling entry point; no implementation is provided in this module."""
    pass
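

# Minimal usage sketch (not part of the original module): train the TF-IDF +
# logistic regression model from pre-split CSV files and log the results.
# The CSV file names below are hypothetical placeholders.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    storage_path, fit_pipeline, metrics = train(
        train_df="soft-search-train.csv",  # hypothetical path
        test_df="soft-search-test.csv",  # hypothetical path
    )
    log.info("Stored fitted model to: %s", storage_path)
    log.info("Evaluation metrics: %s", metrics)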