Source code for soft_search.data.soft_search_2022

#!/usr/bin/env python


from pathlib import Path
from typing import Dict, List, Optional, Union

import pandas as pd
import requests
from tqdm.contrib.concurrent import thread_map

from ..constants import NSFFields, PredictionLabels

###############################################################################

SOFT_SEARCH_2022_DS_PATH = Path(__file__).parent / "soft-search-2022-labelled.parquet"
SOFT_SEARCH_2022_IRR_PATH = Path(__file__).parent / "soft-search-2022-irr.parquet"
GH_REPOS_WITH_NSF_REF_2022_PATH = (
    Path(__file__).parent / "gh-search-results-duplicates-removed.csv"
)
GH_REPOS_LINKED_TO_NSF_IDS_PATH = (
    Path(__file__).parent / "linked-github-nsf-results.parquet"
)
LINDSEY_GH_REPOS_ANNOTATION_PATH = (
    Path(__file__).parent / "gh-repo-annotations-lindsey.csv"
)
RICHARD_GH_REPOS_ANNOTATION_PATH = (
    Path(__file__).parent / "gh-repo-annotations-richard.csv"
)


class SoftSearch2022IRRDatasetFields:
    annotator = "annotator"
    github_link = "github_link"
    include_in_definition = "include_in_definition"
    notes = "notes"
    most_recent_commit_datetime = "most_recent_commit_datetime"


ALL_SOFT_SEARCH_2022_IRR_DATASET_FIELDS = [
    getattr(SoftSearch2022IRRDatasetFields, a)
    for a in dir(SoftSearch2022IRRDatasetFields)
    if "__" not in a
]


class SoftSearch2022DatasetFields:
    github_link = "github_link"
    nsf_award_id = "nsf_award_id"
    nsf_award_link = "nsf_award_link"
    abstract_text = "abstract_text"
    project_outcomes = "project_outcomes"
    label = "label"
    from_template_repo = "from_template_repo"
    is_a_fork = "is_a_fork"


ALL_SOFT_SEARCH_2022_DATASET_FIELDS = [
    getattr(SoftSearch2022DatasetFields, a)
    for a in dir(SoftSearch2022DatasetFields)
    if "__" not in a
]

###############################################################################
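

# A minimal sketch of how the field constant classes above are meant to be used
# downstream: columns referenced by attribute rather than by raw strings, and
# the ALL_* lists as the expected schema. The helper below (its name and the
# idea of reporting missing columns) is an illustrative assumption, not part of
# the package API.
def _example_missing_training_columns(frame: pd.DataFrame) -> List[str]:
    # Return the expected training-dataset columns absent from `frame`.
    return [
        col
        for col in ALL_SOFT_SEARCH_2022_DATASET_FIELDS
        if col not in frame.columns
    ]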


def load_github_repos_with_nsf_refs_2022() -> pd.DataFrame:
    """
    Load the GitHub repositories with references to NSF dataset.

    Created via the `get-github-repositories-with-nsf-ref` bin script.

    Returns
    -------
    pd.DataFrame
        The dataset.
    """
    return pd.read_csv(GH_REPOS_WITH_NSF_REF_2022_PATH)


def _prepare_soft_search_2022_irr(
    all_annos: List[Union[str, Path, pd.DataFrame]],
) -> Path:
    """
    Prepare and store sample annotation data for use in future IRR calculation.

    Parameters
    ----------
    all_annos: List[Union[str, Path, pd.DataFrame]]
        A list of paths or in-memory pandas DataFrames for the raw manually
        labelled data from each annotator, used for calculating inter-rater
        reliability. Only the CSV file format is supported when providing
        file paths.

    Returns
    -------
    Path
        The Path to the prepared and stored parquet file.
    """
    # Map common typos and whitespace variants to the canonical values
    exclude_include_values_map = {
        "exclude": "exclude",
        "include": "include",
        "include ": "include",
        "incldue": "include",
        "exclude ": "exclude",
        "excude": "exclude",
        "include?": "include",
    }

    # Selected data
    columns_subset_frames: List[pd.DataFrame] = []
    for i, anno in enumerate(all_annos):
        # Load the data
        annotator_label: Union[str, int]
        if isinstance(anno, (str, Path)):
            anno_data = pd.read_csv(anno)
            annotator_label = Path(anno).with_suffix("").name
        else:
            anno_data = anno
            annotator_label = i

        # Drop duplicate "notes" column before rename
        anno_data = anno_data.drop(columns=["notes"])

        # Rename columns
        anno_data = anno_data.rename(
            columns={
                "include/exclude": (
                    SoftSearch2022IRRDatasetFields.include_in_definition
                ),
                "link": SoftSearch2022IRRDatasetFields.github_link,
                "Notes (justifications) ": SoftSearch2022IRRDatasetFields.notes,
                "most_recent_commit_datetime": (
                    SoftSearch2022IRRDatasetFields.most_recent_commit_datetime
                ),
            }
        )

        # Subset columns
        subset = anno_data[
            [
                col
                for col in ALL_SOFT_SEARCH_2022_IRR_DATASET_FIELDS
                if col != SoftSearch2022IRRDatasetFields.annotator
            ]
        ]

        # Sort by link to have semi-consistent order
        subset = subset.sort_values(
            by=[
                SoftSearch2022IRRDatasetFields.github_link,
            ],
        )

        # Rename values
        subset[SoftSearch2022IRRDatasetFields.include_in_definition] = subset[
            SoftSearch2022IRRDatasetFields.include_in_definition
        ].map(exclude_include_values_map)

        # Add column for annotator
        subset[SoftSearch2022IRRDatasetFields.annotator] = annotator_label

        columns_subset_frames.append(subset)

    combined = pd.concat(columns_subset_frames).reset_index(drop=True)
    combined.to_parquet(SOFT_SEARCH_2022_IRR_PATH)
    return SOFT_SEARCH_2022_IRR_PATH
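

# A minimal sketch of how the parquet written by `_prepare_soft_search_2022_irr`
# could be consumed for a quick agreement check. It assumes exactly two
# annotators in the file and uses plain percent agreement rather than a
# chance-corrected IRR statistic; the helper name is an illustrative
# assumption, not part of the package API.
def _example_percent_agreement() -> float:
    irr = pd.read_parquet(SOFT_SEARCH_2022_IRR_PATH)
    # One row per GitHub link, one column per annotator, values are the
    # include/exclude decisions.
    wide = irr.pivot_table(
        index=SoftSearch2022IRRDatasetFields.github_link,
        columns=SoftSearch2022IRRDatasetFields.annotator,
        values=SoftSearch2022IRRDatasetFields.include_in_definition,
        aggfunc="first",
    ).dropna()
    first, second = wide.columns[:2]
    return float((wide[first] == wide[second]).mean())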


def load_linked_github_repositories_with_nsf_awards_2022() -> pd.DataFrame:
    """
    Load the GitHub repositories linked to specific NSF award IDs dataset.

    Created via the `find-nsf-award-ids-in-github-readmes-and-link` bin script.

    Returns
    -------
    pd.DataFrame
        The dataset.
    """
    return pd.read_parquet(GH_REPOS_LINKED_TO_NSF_IDS_PATH)
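

# A hedged usage sketch for the loader above: count how many distinct GitHub
# repositories were linked to each NSF award. The "github_link" and
# "nsf_award_id" column names are taken from the joins performed in
# `_prepare_soft_search_2022` below; the helper name is illustrative.
def _example_repos_per_award() -> pd.Series:
    linked = load_linked_github_repositories_with_nsf_awards_2022()
    return (
        linked.groupby("nsf_award_id")["github_link"]
        .nunique()
        .sort_values(ascending=False)
    )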


def _prepare_soft_search_2022(
    linked_nsf_github_repos: Union[str, Path, pd.DataFrame] = (
        GH_REPOS_LINKED_TO_NSF_IDS_PATH
    ),
    lindsey_data: Union[str, Path, pd.DataFrame] = LINDSEY_GH_REPOS_ANNOTATION_PATH,
    richard_data: Union[str, Path, pd.DataFrame] = RICHARD_GH_REPOS_ANNOTATION_PATH,
) -> Path:
    """
    Prepare the soft search dataset for storage in the package.

    Merge various dataframes together. Fetch NSF fields for each NSF Award ID.
    Drop duplicates. Store to parquet in the project data archive.

    Parameters
    ----------
    linked_nsf_github_repos: Union[str, Path, pd.DataFrame]
        The path or in-memory pandas DataFrame for the GitHub repositories
        linked to NSF Awards, produced by the
        `find-nsf-award-ids-in-github-readmes-and-link` script.
        Only the Parquet file format is supported when providing a file path.
    lindsey_data: Union[str, Path, pd.DataFrame]
        The path or in-memory pandas DataFrame for the raw manually labelled
        data from Lindsey. Only the CSV file format is supported when
        providing a file path.
    richard_data: Union[str, Path, pd.DataFrame]
        The path or in-memory pandas DataFrame for the raw manually labelled
        data from Richard. Only the CSV file format is supported when
        providing a file path.

    Returns
    -------
    Path
        The Path to the prepared and stored parquet file.
    """
    # Read data
    if isinstance(linked_nsf_github_repos, (str, Path)):
        linked_nsf_github_df = pd.read_parquet(linked_nsf_github_repos)
    else:
        linked_nsf_github_df = linked_nsf_github_repos
    if isinstance(lindsey_data, (str, Path)):
        lindsey_df = pd.read_csv(lindsey_data)
    else:
        lindsey_df = lindsey_data
    if isinstance(richard_data, (str, Path)):
        richard_df = pd.read_csv(richard_data)
    else:
        richard_df = richard_data

    # Clean Lindsey
    lindsey_df = lindsey_df[["include/exclude", "link"]]
    lindsey_df = lindsey_df[~lindsey_df["include/exclude"].isna()]

    # Clean Richard
    richard_df = richard_df[["include/exclude", "link"]]
    richard_df = richard_df[~richard_df["include/exclude"].isna()]

    # Join and clean after merge
    data_lindsey = lindsey_df.join(
        linked_nsf_github_df.set_index("github_link"),
        on="link",
    )
    data_richard = richard_df.join(
        linked_nsf_github_df.set_index("github_link"),
        on="link",
    )
    data = pd.concat([data_lindsey, data_richard])
    data = data.drop_duplicates(subset=["link", "nsf_award_id"])
    data = data.dropna(subset=["nsf_award_id"])

    # Get both the abstract and the project outcomes report
    get_nsf_fields = ",".join(
        [
            NSFFields.abstractText,
            NSFFields.projectOutComesReport,
        ]
    )

    def _thread_text_prediction_cols(
        award_id: int,
    ) -> Optional[Dict[str, Union[int, str]]]:
        response_data = requests.get(
            f"https://api.nsf.gov/"
            f"services/v1/awards/{award_id}.json"
            f"?printFields={get_nsf_fields}"
        ).json()

        # Handle data existence
        if "response" not in response_data:
            return None
        response_subset = response_data["response"]
        if "award" not in response_subset:
            return None
        award_data = response_subset["award"]
        if len(award_data) == 0:
            return None
        single_award = award_data[0]

        # Return the award id, the abstract text, and the project outcomes
        return {
            "award_id": award_id,
            "abstract_text": single_award[NSFFields.abstractText],
            "project_outcomes": single_award.get(
                NSFFields.projectOutComesReport, None
            ),
        }

    # Thread gather texts
    abstract_texts_list = thread_map(
        _thread_text_prediction_cols,
        data.nsf_award_id.unique(),
        desc="Getting NSF Award Abstracts",
    )

    # Filter failed values
    extra_items = pd.DataFrame([at for at in abstract_texts_list if at is not None])

    # Join to original data frame
    data = data.join(extra_items.set_index("award_id"), on="nsf_award_id")

    # Drop any rows that are missing abstract text
    data = data.dropna(subset=["abstract_text"])

    # Rename to standard set
    data = data.rename(
        columns={
            "include/exclude": SoftSearch2022DatasetFields.label,
            "link": SoftSearch2022DatasetFields.github_link,
            "nsf_link": SoftSearch2022DatasetFields.nsf_award_link,
        },
    )

    # Replace include and exclude with the standard prediction labels
    data[SoftSearch2022DatasetFields.label] = data[
        SoftSearch2022DatasetFields.label
    ].replace(
        {
            "exclude": PredictionLabels.SoftwareNotPredicted,
            "include": PredictionLabels.SoftwarePredicted,
        }
    )

    # We want to drop duplicates of NSF award ID: there should only be one
    # example per NSF award ID, so there is no need to duplicate the examples.
    # NOTE: before we drop duplicates we sort by label descending so that
    # if an NSF award ID has any example labelled `software-predicted`,
    # it retains that label. Prior to this line, an award may have multiple
    # examples in the dataset, some of which produce software and some of
    # which do not; if ANY of those examples produce software, we want to
    # label the award as producing software.
    data = data.sort_values(by=["label"], ascending=False)
    data = data.drop_duplicates(subset=["nsf_award_id"])

    # Store to standard location
    data.to_parquet(SOFT_SEARCH_2022_DS_PATH)
    return SOFT_SEARCH_2022_DS_PATH
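

# A minimal standalone sketch of the per-award NSF lookup performed inside
# `_prepare_soft_search_2022`, for a single award ID. The URL pattern and the
# `printFields` query come from the function above; the reduced error handling
# (returning None) and the helper name are illustrative assumptions.
def _example_fetch_award_abstract(award_id: int) -> Optional[str]:
    fields = ",".join([NSFFields.abstractText, NSFFields.projectOutComesReport])
    response_data = requests.get(
        f"https://api.nsf.gov/services/v1/awards/{award_id}.json"
        f"?printFields={fields}"
    ).json()
    awards = response_data.get("response", {}).get("award", [])
    if len(awards) == 0:
        return None
    return awards[0].get(NSFFields.abstractText)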


def load_soft_search_2022_training() -> pd.DataFrame:
    """
    Load the Software Search 2022 manually labelled dataset.

    Returns
    -------
    pd.DataFrame
        The dataset.
    """
    return pd.read_parquet(SOFT_SEARCH_2022_DS_PATH)


def load_soft_search_2022_training_irr() -> pd.DataFrame:
    """
    Load the Software Search 2022 Inter-Rater Reliability labelled dataset.

    Returns
    -------
    pd.DataFrame
        The dataset.
    """
    return pd.read_parquet(SOFT_SEARCH_2022_IRR_PATH)
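

# A short usage sketch for the two loaders above: inspect the label balance of
# the training data and the size of the IRR sample. The "label" column name
# comes from `SoftSearch2022DatasetFields`; the printed summary is illustrative
# only, not part of the package API.
if __name__ == "__main__":
    training = load_soft_search_2022_training()
    irr = load_soft_search_2022_training_irr()
    print(training[SoftSearch2022DatasetFields.label].value_counts())
    print(f"IRR sample size: {len(irr)}")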