# Source code for soft_search.bin.generate_nsf_soft_search_2022_dataset

#!/usr/bin/env python

import argparse
import logging
import sys
import traceback
from pathlib import Path
from typing import List

import pandas as pd
from tqdm import tqdm

from soft_search import nsf
from soft_search.constants import ALL_NSF_PROGRAMS, NSFFields
from soft_search.label import (
    load_tfidf_logit_for_prediction_from_abstract,
    load_tfidf_logit_for_prediction_from_outcomes,
)

###############################################################################


class Args(argparse.Namespace):
    """Command-line arguments for the NSF soft-search 2022 dataset generator.

    Parsing happens eagerly on construction; the parsed values are written
    directly onto this namespace instance.
    """

    def __init__(self) -> None:
        self.__parse()

    def __parse(self) -> None:
        # Build the CLI parser and populate this namespace in place.
        parser = argparse.ArgumentParser(
            prog="generate-nsf-soft-search-2022-dataset",
            description=(
                "Get the NSF Awards from the last 12 years, predict "
                "if they produced software from both the abstract and the "
                "project outcomes report and archive the results."
            ),
        )
        parser.add_argument(
            "-s",
            "--start-date",
            dest="start_date",
            default="2010-01-01",
            type=str,
            help="ISO format string with the date to start gathering awards for.",
        )
        parser.add_argument(
            "-e",
            "--end-date",
            dest="end_date",
            default="2023-01-01",
            type=str,
            help="ISO format string with the date to end gathering awards for.",
        )
        parser.add_argument(
            "-o",
            "--outfile",
            dest="outfile",
            default=Path("./nsf-soft-search-2022.csv"),
            type=Path,
            help="The path to store the dataset CSV (and Parquet with the same name).",
        )
        parser.add_argument(
            "--debug",
            dest="debug",
            action="store_true",
            help="Run with debug logging.",
        )
        parser.parse_args(namespace=self)
###############################################################################
def main() -> None:
    """Generate and archive the NSF soft-search 2022 dataset.

    Gathers NSF awards for every major program between the CLI start and end
    dates, predicts software production from both the abstract and (where
    available) the project outcomes report, then stores the combined table
    as CSV and Parquet. Exits with status 1 on any failure.
    """
    # Get args
    args = Args()

    # Determine log level
    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO

    # Setup logging
    logging.basicConfig(
        level=log_level,
        format="[%(levelname)4s: %(module)s:%(lineno)4s %(asctime)s] %(message)s",
    )
    log = logging.getLogger(__name__)

    # Get all program chunks
    # Concat
    # Store to CSV
    try:
        # Get chunks
        program_chunks: List[pd.DataFrame] = []
        for program in tqdm(ALL_NSF_PROGRAMS, desc="Iterating major programs..."):
            log.info(f"Gathering {program} dataset chunk...")
            chunk = nsf.get_nsf_dataset(
                start_date=args.start_date,
                end_date=args.end_date,
                program_name=program,
                require_project_outcomes_doc=False,
            )
            # Tag each chunk with the program it was fetched for
            chunk["majorProgram"] = program
            program_chunks.append(chunk)

        # Concat and report size; drop duplicate award ids (an award can appear
        # under multiple programs) and awards without an abstract
        awards = (
            pd.concat(program_chunks, ignore_index=True)
            .drop_duplicates(NSFFields.id_)
            .dropna(subset=[NSFFields.abstractText])
            .reset_index(drop=True)
        )
        log.info(f"Total awards found: {len(awards)}")

        # Make predictions
        log.info("Loading model and predicting software production from abstract.")
        model = load_tfidf_logit_for_prediction_from_abstract()
        awards["prediction_from_abstract"] = model.predict(
            awards[NSFFields.abstractText],
        )
        log.info("Loading model and predicting software production from outcomes.")
        model = load_tfidf_logit_for_prediction_from_outcomes()

        # Subset to just awards with outcomes.
        # .copy() makes the subset an independent frame so the column
        # assignment below cannot trigger pandas' SettingWithCopyWarning
        # (or be silently dropped under copy-on-write semantics).
        outcomes_awards = awards.dropna(
            subset=[NSFFields.projectOutComesReport],
        ).copy()
        outcomes_awards["prediction_from_outcomes"] = model.predict(
            outcomes_awards[NSFFields.projectOutComesReport],
        )
        outcomes_awards = outcomes_awards[[NSFFields.id_, "prediction_from_outcomes"]]

        # Join the outcomes predictions back to the full set; awards without
        # an outcomes report get NaN in "prediction_from_outcomes"
        awards = awards.join(outcomes_awards.set_index(NSFFields.id_), on=NSFFields.id_)

        # Store
        outfile = args.outfile.resolve()
        awards.to_csv(outfile, index=False)
        awards.to_parquet(outfile.with_suffix(".parquet"))
        log.info(f"Awards dataset stored to: '{outfile}'.")
    except Exception as e:
        log.error("=============================================")
        log.error("\n\n" + traceback.format_exc())
        log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
###############################################################################
# Allow caller to directly run this module (usually in development scenarios)

if __name__ == "__main__":
    main()