Source code for soft_search.data.soft_search_2022

#!/usr/bin/env python


from pathlib import Path
from typing import Dict, List, Optional, Union

import pandas as pd
import requests
from tqdm.contrib.concurrent import thread_map

from ..constants import NSFFields, PredictionLabels

###############################################################################

SOFT_SEARCH_2022_DS_PATH = Path(__file__).parent / "soft-search-2022-labelled.parquet"
SOFT_SEARCH_2022_IRR_PATH = Path(__file__).parent / "soft-search-2022-irr.parquet"
GH_REPOS_WITH_NSF_REF_2022_PATH = (
    Path(__file__).parent / "gh-search-results-duplicates-removed.csv"
)
GH_REPOS_LINKED_TO_NSF_IDS_PATH = (
    Path(__file__).parent / "linked-github-nsf-results.parquet"
)
LINDSEY_GH_REPOS_ANNOTATION_PATH = (
    Path(__file__).parent / "gh-repo-annotations-lindsey.csv"
)
RICHARD_GH_REPOS_ANNOTATION_PATH = (
    Path(__file__).parent / "gh-repo-annotations-richard.csv"
)


class SoftSearch2022IRRDatasetFields:
    annotator = "annotator"
    github_link = "github_link"
    include_in_definition = "include_in_definition"
    notes = "notes"
    most_recent_commit_datetime = "most_recent_commit_datetime"


ALL_SOFT_SEARCH_2022_IRR_DATASET_FIELDS = [
    getattr(SoftSearch2022IRRDatasetFields, a)
    for a in dir(SoftSearch2022IRRDatasetFields)
    if "__" not in a
]


class SoftSearch2022DatasetFields:
    github_link = "github_link"
    nsf_award_id = "nsf_award_id"
    nsf_award_link = "nsf_award_link"
    abstract_text = "abstract_text"
    project_outcomes = "project_outcomes"
    label = "label"
    from_template_repo = "from_template_repo"
    is_a_fork = "is_a_fork"


ALL_SOFT_SEARCH_2022_DATASET_FIELDS = [
    getattr(SoftSearch2022DatasetFields, a)
    for a in dir(SoftSearch2022DatasetFields)
    if "__" not in a
]

###############################################################################
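

# A minimal sketch of how the field constant classes above are meant to be used
# downstream: columns referenced by attribute rather than by raw strings, and
# the ALL_* lists as the expected schema. The helper below (its name and the
# idea of reporting missing columns) is an illustrative assumption, not part of
# the package API.
def _example_missing_training_columns(frame: pd.DataFrame) -> List[str]:
    # Return the expected training-dataset columns absent from `frame`.
    return [
        col
        for col in ALL_SOFT_SEARCH_2022_DATASET_FIELDS
        if col not in frame.columns
    ]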


def load_github_repos_with_nsf_refs_2022() -> pd.DataFrame:
    """
    Load the GitHub repositories with references to NSF dataset.

    Created via the `get-github-repositories-with-nsf-ref` bin script.

    Returns
    -------
    pd.DataFrame
        The dataset.
    """
    return pd.read_csv(GH_REPOS_WITH_NSF_REF_2022_PATH)


def _prepare_soft_search_2022_irr(
    all_annos: List[Union[str, Path, pd.DataFrame]],
) -> Path:
    """
    Prepare and store sample annotation data for use in future IRR calculation.

    Parameters
    ----------
    all_annos: List[Union[str, Path, pd.DataFrame]]
        A list of paths or in-memory pandas DataFrames for the raw manually
        labelled data from each annotator, used for calculating inter-rater
        reliability. Only the CSV file format is supported when providing
        file paths.

    Returns
    -------
    Path
        The Path to the prepared and stored parquet file.
    """
    # Map common typos and whitespace variants to the canonical values
    exclude_include_values_map = {
        "exclude": "exclude",
        "include": "include",
        "include ": "include",
        "incldue": "include",
        "exclude ": "exclude",
        "excude": "exclude",
        "include?": "include",
    }

    # Selected data
    columns_subset_frames: List[pd.DataFrame] = []
    for i, anno in enumerate(all_annos):
        # Load the data
        annotator_label: Union[str, int]
        if isinstance(anno, (str, Path)):
            anno_data = pd.read_csv(anno)
            annotator_label = Path(anno).with_suffix("").name
        else:
            anno_data = anno
            annotator_label = i

        # Drop duplicate "notes" column before rename
        anno_data = anno_data.drop(columns=["notes"])

        # Rename columns
        anno_data = anno_data.rename(
            columns={
                "include/exclude": (
                    SoftSearch2022IRRDatasetFields.include_in_definition
                ),
                "link": SoftSearch2022IRRDatasetFields.github_link,
                "Notes (justifications) ": SoftSearch2022IRRDatasetFields.notes,
                "most_recent_commit_datetime": (
                    SoftSearch2022IRRDatasetFields.most_recent_commit_datetime
                ),
            }
        )

        # Subset columns
        subset = anno_data[
            [
                col
                for col in ALL_SOFT_SEARCH_2022_IRR_DATASET_FIELDS
                if col != SoftSearch2022IRRDatasetFields.annotator
            ]
        ]

        # Sort by link to have semi-consistent order
        subset = subset.sort_values(
            by=[
                SoftSearch2022IRRDatasetFields.github_link,
            ],
        )

        # Rename values
        subset[SoftSearch2022IRRDatasetFields.include_in_definition] = subset[
            SoftSearch2022IRRDatasetFields.include_in_definition
        ].map(exclude_include_values_map)

        # Add column for annotator
        subset[SoftSearch2022IRRDatasetFields.annotator] = annotator_label

        columns_subset_frames.append(subset)

    combined = pd.concat(columns_subset_frames).reset_index(drop=True)
    combined.to_parquet(SOFT_SEARCH_2022_IRR_PATH)
    return SOFT_SEARCH_2022_IRR_PATH
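

# A minimal sketch of how the parquet written by `_prepare_soft_search_2022_irr`
# could be consumed for a quick agreement check. It assumes exactly two
# annotators in the file and uses plain percent agreement rather than a
# chance-corrected IRR statistic; the helper name is an illustrative
# assumption, not part of the package API.
def _example_percent_agreement() -> float:
    irr = pd.read_parquet(SOFT_SEARCH_2022_IRR_PATH)
    # One row per GitHub link, one column per annotator, values are the
    # include/exclude decisions.
    wide = irr.pivot_table(
        index=SoftSearch2022IRRDatasetFields.github_link,
        columns=SoftSearch2022IRRDatasetFields.annotator,
        values=SoftSearch2022IRRDatasetFields.include_in_definition,
        aggfunc="first",
    ).dropna()
    first, second = wide.columns[:2]
    return float((wide[first] == wide[second]).mean())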


def load_linked_github_repositories_with_nsf_awards_2022() -> pd.DataFrame:
    """
    Load the GitHub repositories linked to specific NSF award IDs dataset.

    Created via the `find-nsf-award-ids-in-github-readmes-and-link` bin script.

    Returns
    -------
    pd.DataFrame
        The dataset.
    """
    return pd.read_parquet(GH_REPOS_LINKED_TO_NSF_IDS_PATH)
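

# A hedged usage sketch for the loader above: count how many distinct GitHub
# repositories were linked to each NSF award. The "github_link" and
# "nsf_award_id" column names are taken from the joins performed in
# `_prepare_soft_search_2022` below; the helper name is illustrative.
def _example_repos_per_award() -> pd.Series:
    linked = load_linked_github_repositories_with_nsf_awards_2022()
    return (
        linked.groupby("nsf_award_id")["github_link"]
        .nunique()
        .sort_values(ascending=False)
    )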


def _prepare_soft_search_2022(
    linked_nsf_github_repos: Union[str, Path, pd.DataFrame] = (
        GH_REPOS_LINKED_TO_NSF_IDS_PATH
    ),
    lindsey_data: Union[str, Path, pd.DataFrame] = LINDSEY_GH_REPOS_ANNOTATION_PATH,
    richard_data: Union[str, Path, pd.DataFrame] = RICHARD_GH_REPOS_ANNOTATION_PATH,
) -> Path:
    """
    Prepare the soft search dataset for storage in the package.

    Merge various dataframes together. Fetch NSF fields for each NSF Award ID.
    Drop duplicates. Store to parquet in the project data archive.

    Parameters
    ----------
    linked_nsf_github_repos: Union[str, Path, pd.DataFrame]
        The path or in-memory pandas DataFrame for the GitHub repositories
        linked to NSF Awards, produced by the
        `find-nsf-award-ids-in-github-readmes-and-link` script.
        Only the Parquet file format is supported when providing a file path.
    lindsey_data: Union[str, Path, pd.DataFrame]
        The path or in-memory pandas DataFrame for the raw manually labelled
        data from Lindsey. Only the CSV file format is supported when
        providing a file path.
    richard_data: Union[str, Path, pd.DataFrame]
        The path or in-memory pandas DataFrame for the raw manually labelled
        data from Richard. Only the CSV file format is supported when
        providing a file path.

    Returns
    -------
    Path
        The Path to the prepared and stored parquet file.
    """
    # Read data
    if isinstance(linked_nsf_github_repos, (str, Path)):
        linked_nsf_github_df = pd.read_parquet(linked_nsf_github_repos)
    else:
        linked_nsf_github_df = linked_nsf_github_repos
    if isinstance(lindsey_data, (str, Path)):
        lindsey_df = pd.read_csv(lindsey_data)
    else:
        lindsey_df = lindsey_data
    if isinstance(richard_data, (str, Path)):
        richard_df = pd.read_csv(richard_data)
    else:
        richard_df = richard_data

    # Clean Lindsey
    lindsey_df = lindsey_df[["include/exclude", "link"]]
    lindsey_df = lindsey_df[~lindsey_df["include/exclude"].isna()]

    # Clean Richard
    richard_df = richard_df[["include/exclude", "link"]]
    richard_df = richard_df[~richard_df["include/exclude"].isna()]

    # Join and clean after merge
    data_lindsey = lindsey_df.join(
        linked_nsf_github_df.set_index("github_link"),
        on="link",
    )
    data_richard = richard_df.join(
        linked_nsf_github_df.set_index("github_link"),
        on="link",
    )
    data = pd.concat([data_lindsey, data_richard])
    data = data.drop_duplicates(subset=["link", "nsf_award_id"])
    data = data.dropna(subset=["nsf_award_id"])

    # Get both the abstract and the project outcomes report
    get_nsf_fields = ",".join(
        [
            NSFFields.abstractText,
            NSFFields.projectOutComesReport,
        ]
    )

    def _thread_text_prediction_cols(
        award_id: int,
    ) -> Optional[Dict[str, Union[int, str]]]:
        response_data = requests.get(
            f"https://api.nsf.gov/"
            f"services/v1/awards/{award_id}.json"
            f"?printFields={get_nsf_fields}"
        ).json()

        # Handle data existence
        if "response" not in response_data:
            return None
        response_subset = response_data["response"]
        if "award" not in response_subset:
            return None
        award_data = response_subset["award"]
        if len(award_data) == 0:
            return None
        single_award = award_data[0]

        # Return the award id, the abstract text, and the project outcomes
        return {
            "award_id": award_id,
            "abstract_text": single_award[NSFFields.abstractText],
            "project_outcomes": single_award.get(
                NSFFields.projectOutComesReport, None
            ),
        }

    # Thread gather texts
    abstract_texts_list = thread_map(
        _thread_text_prediction_cols,
        data.nsf_award_id.unique(),
        desc="Getting NSF Award Abstracts",
    )

    # Filter failed values
    extra_items = pd.DataFrame([at for at in abstract_texts_list if at is not None])

    # Join to original data frame
    data = data.join(extra_items.set_index("award_id"), on="nsf_award_id")

    # Drop any rows that are missing abstract text
    data = data.dropna(subset=["abstract_text"])

    # Rename to standard set
    data = data.rename(
        columns={
            "include/exclude": SoftSearch2022DatasetFields.label,
            "link": SoftSearch2022DatasetFields.github_link,
            "nsf_link": SoftSearch2022DatasetFields.nsf_award_link,
        },
    )

    # Replace include and exclude with the standard prediction labels
    data[SoftSearch2022DatasetFields.label] = data[
        SoftSearch2022DatasetFields.label
    ].replace(
        {
            "exclude": PredictionLabels.SoftwareNotPredicted,
            "include": PredictionLabels.SoftwarePredicted,
        }
    )

    # We want to drop duplicates of NSF award ID: there should only be one
    # example per NSF award ID, so there is no need to duplicate the examples.
    # NOTE: before we drop duplicates we sort by label descending so that
    # if an NSF award ID has any example labelled `software-predicted`,
    # it retains that label. Prior to this line, an award may have multiple
    # examples in the dataset, some of which produce software and some of
    # which do not; if ANY of those examples produce software, we want to
    # label the award as producing software.
    data = data.sort_values(by=["label"], ascending=False)
    data = data.drop_duplicates(subset=["nsf_award_id"])

    # Store to standard location
    data.to_parquet(SOFT_SEARCH_2022_DS_PATH)
    return SOFT_SEARCH_2022_DS_PATH
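

# A minimal standalone sketch of the per-award NSF lookup performed inside
# `_prepare_soft_search_2022`, for a single award ID. The URL pattern and the
# `printFields` query come from the function above; the reduced error handling
# (returning None) and the helper name are illustrative assumptions.
def _example_fetch_award_abstract(award_id: int) -> Optional[str]:
    fields = ",".join([NSFFields.abstractText, NSFFields.projectOutComesReport])
    response_data = requests.get(
        f"https://api.nsf.gov/services/v1/awards/{award_id}.json"
        f"?printFields={fields}"
    ).json()
    awards = response_data.get("response", {}).get("award", [])
    if len(awards) == 0:
        return None
    return awards[0].get(NSFFields.abstractText)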


def load_soft_search_2022_training() -> pd.DataFrame:
    """
    Load the Software Search 2022 manually labelled dataset.

    Returns
    -------
    pd.DataFrame
        The dataset.
    """
    return pd.read_parquet(SOFT_SEARCH_2022_DS_PATH)


def load_soft_search_2022_training_irr() -> pd.DataFrame:
    """
    Load the Software Search 2022 Inter-Rater Reliability labelled dataset.

    Returns
    -------
    pd.DataFrame
        The dataset.
    """
    return pd.read_parquet(SOFT_SEARCH_2022_IRR_PATH)
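

# A short usage sketch for the two loaders above: inspect the label balance of
# the training data and the size of the IRR sample. The "label" column name
# comes from `SoftSearch2022DatasetFields`; the printed summary is illustrative
# only, not part of the package API.
if __name__ == "__main__":
    training = load_soft_search_2022_training()
    irr = load_soft_search_2022_training_irr()
    print(training[SoftSearch2022DatasetFields.label].value_counts())
    print(f"IRR sample size: {len(irr)}")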