Source code for soft_search.label.regex

#!/usr/bin/env python

import re
from pathlib import Path
from typing import Union

import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from ..constants import PredictionLabels
from ..data.soft_search_2022 import SoftSearch2022DatasetFields
from ..metrics import EvaluationMetrics

###############################################################################
# Constants

# Regex over free text flagging software-like terminology: "software"
# (optionally followed by a qualifier such as " tool(s)" or " framework(s)"),
# "algorithm(s)", or "tool(s)". The surrounding ``.*`` let the pattern succeed
# under ``re.match``, which anchors at the start of the string.
# NOTE(review): matching is case-sensitive, so a capitalised "Software" is not
# detected — confirm whether that is intended before changing it.
SOFTWARE_LIKE_PATTERNS = (
    r".*(?:software(?:\s(?:code|tool|suite|program|application|framework)s?|"
    r"(?:binar|librar)(?:y|ies))?|algorithms?|tools?).*"
)
# DOTALL lets ``.*`` cross line breaks. Without it, ``re.match`` could only
# ever find a term on the first line of a multi-line text.
COMPILED_SOFTWARE_LIKE_PATTERNS = re.compile(SOFTWARE_LIKE_PATTERNS, re.DOTALL)
# Name of the DataFrame column that receives the regex-based label.
REGEX_LABEL_COL = "regex_match"

###############################################################################


def _apply_regex(text: str) -> str:
    """
    Label a single piece of text using the software-like regex.

    Parameters
    ----------
    text: str
        The text to test against ``COMPILED_SOFTWARE_LIKE_PATTERNS``.

    Returns
    -------
    str
        ``PredictionLabels.SoftwarePredicted`` when the pattern matches
        (anchored at the start of ``text``), otherwise
        ``PredictionLabels.SoftwareNotPredicted``.
    """
    # Guard clause: no match means no software-like terminology was found.
    if COMPILED_SOFTWARE_LIKE_PATTERNS.match(text) is None:
        return PredictionLabels.SoftwareNotPredicted

    return PredictionLabels.SoftwarePredicted


def train(
    df: Union[str, Path, pd.DataFrame],
    text_col: str = SoftSearch2022DatasetFields.abstract_text,
    label_col: str = SoftSearch2022DatasetFields.label,
) -> EvaluationMetrics:
    """
    Evaluate the regex labeller against a labelled dataset.

    Nothing is fitted here: the fixed regex is applied to every row of
    ``text_col`` and the resulting labels are scored against ``label_col``.

    Parameters
    ----------
    df: Union[str, Path, pd.DataFrame]
        The labelled data, either as a DataFrame or as a path that is
        read with ``pd.read_csv``.
    text_col: str
        The column holding the text the regex is applied to.
        Default: SoftSearch2022DatasetFields.abstract_text
    label_col: str
        The column holding the ground-truth labels.
        Default: SoftSearch2022DatasetFields.label

    Returns
    -------
    EvaluationMetrics
        Weighted precision, recall and F1 plus accuracy, reported under the
        model name "regex".
    """
    # Paths are loaded from disk; DataFrames are used as provided
    frame = pd.read_csv(df) if isinstance(df, (str, Path)) else df

    # Regex "predictions" first, then the ground truth they are scored against
    predicted = frame[text_col].apply(_apply_regex).to_numpy()
    truth = frame[label_col]

    scores = precision_recall_fscore_support(
        truth,
        predicted,
        average="weighted",
    )
    # The fourth element of the result is not used
    precision, recall, f1_score = scores[0], scores[1], scores[2]
    accuracy = accuracy_score(truth, predicted)

    return EvaluationMetrics(
        model="regex",
        precision=precision,
        recall=recall,
        f1=f1_score,
        accuracy=accuracy,
    )
def label(
    df: pd.DataFrame,
    apply_column: str = "text",
    label_column: str = REGEX_LABEL_COL,
) -> pd.DataFrame:
    """
    Add a regex-derived software "prediction" column to a DataFrame in-place.

    Each value of ``apply_column`` is labelled as software predicted or not,
    based solely on whether it matches a regex of software-like and adjacent
    terminology.

    Parameters
    ----------
    df: pd.DataFrame
        The pandas DataFrame that receives the new column. It is modified
        in-place.
    apply_column: str
        The column whose text is used for "prediction".
        Default: "text"
    label_column: str
        The name of the column to write the outcome "prediction" to.
        Default: "regex_match"

    Returns
    -------
    pd.DataFrame
        The same pandas DataFrame object that was passed in, now carrying
        the ``label_column`` column.

    See Also
    --------
    soft_search.nsf.get_nsf_dataset
        Function to get an NSF dataset for prediction.
    """
    # Compute the per-row labels, then attach them to the caller's frame
    regex_labels = df[apply_column].apply(_apply_regex)
    df[label_column] = regex_labels

    return df