# Source code for soft_search.bin.generate_nsf_soft_search_2022_dataset

#!/usr/bin/env python

import argparse
import logging
import sys
import traceback
from pathlib import Path
from typing import List

import pandas as pd
from tqdm import tqdm

from soft_search import nsf
from soft_search.constants import ALL_NSF_PROGRAMS, NSFFields
from soft_search.label import (
    load_tfidf_logit_for_prediction_from_abstract,
    load_tfidf_logit_for_prediction_from_outcomes,
)

###############################################################################



# [docs]
class Args(argparse.Namespace):
    """
    Command-line options for the dataset generation script.

    Creating an instance parses the command line right away and stores the
    parsed values on the instance itself as ``start_date``, ``end_date``,
    ``outfile`` and ``debug``.
    """

    def __init__(self) -> None:
        self.__parse()

    def __parse(self) -> None:
        """Build the argument parser and parse the options into ``self``."""
        summary = (
            "Get the NSF Awards from the last 12 years, predict if they produced "
            "software from both the abstract and the project outcomes report "
            "and archive the results."
        )
        start_help = "ISO format string with the date to start gathering awards for."
        end_help = "ISO format string with the date to end gathering awards for."
        outfile_help = (
            "The path to store the dataset CSV (and Parquet with the same name)."
        )

        parser = argparse.ArgumentParser(
            prog="generate-nsf-soft-search-2022-dataset",
            description=summary,
        )

        # Date window used when gathering awards
        parser.add_argument(
            "-s", "--start-date",
            dest="start_date", default="2010-01-01", type=str, help=start_help,
        )
        parser.add_argument(
            "-e", "--end-date",
            dest="end_date", default="2023-01-01", type=str, help=end_help,
        )

        # Output location
        parser.add_argument(
            "-o", "--outfile",
            dest="outfile",
            default=Path("./nsf-soft-search-2022.csv"),
            type=Path,
            help=outfile_help,
        )

        # Logging verbosity switch
        parser.add_argument(
            "--debug",
            dest="debug", action="store_true", help="Run with debug logging.",
        )

        # Parse straight into this namespace so options become attributes
        parser.parse_args(namespace=self)



###############################################################################



# [docs]
def main() -> None:
    """
    Build the NSF soft-search dataset and store it as CSV and Parquet.

    Steps performed:

    1. Parse the command-line options and configure logging.
    2. Request one dataset chunk per entry of ``ALL_NSF_PROGRAMS`` for the
       requested date range and tag each chunk with its program name.
    3. Concatenate the chunks, dropping duplicated award ids and rows
       without abstract text.
    4. Predict software production from the abstract of every award, and
       from the project outcomes report for the awards that have one.
    5. Write the result to ``args.outfile`` (CSV) and to the same path with
       a ``.parquet`` suffix.

    Any exception raised during steps 2-5 is logged together with its
    traceback and the process exits with status code 1.
    """
    # Get args
    args = Args()

    # Determine log level
    log_level = logging.DEBUG if args.debug else logging.INFO

    # Setup logging
    logging.basicConfig(
        level=log_level,
        format="[%(levelname)4s: %(module)s:%(lineno)4s %(asctime)s] %(message)s",
    )
    log = logging.getLogger(__name__)

    # Get all program chunks
    # Concat
    # Store to CSV
    try:
        # Get chunks
        program_chunks: List[pd.DataFrame] = []
        for program in tqdm(ALL_NSF_PROGRAMS, desc="Iterating major programs..."):
            log.info(f"Gathering {program} dataset chunk...")
            chunk = nsf.get_nsf_dataset(
                start_date=args.start_date,
                end_date=args.end_date,
                program_name=program,
                require_project_outcomes_doc=False,
            )
            chunk["majorProgram"] = program
            program_chunks.append(chunk)

        # Concat and report size
        awards = (
            pd.concat(program_chunks, ignore_index=True)
            .drop_duplicates(NSFFields.id_)
            .dropna(subset=[NSFFields.abstractText])
            .reset_index(drop=True)
        )
        log.info(f"Total awards found: {len(awards)}")

        # Make predictions
        log.info("Loading model and predicting software production from abstract.")
        model = load_tfidf_logit_for_prediction_from_abstract()
        awards["prediction_from_abstract"] = model.predict(
            awards[NSFFields.abstractText],
        )

        # Subset to just awards with outcomes.
        # Take an explicit copy: a column is added to this frame below and
        # assigning into a frame derived from `awards` without copying can
        # trigger pandas' SettingWithCopyWarning.
        outcomes_awards = awards.dropna(
            subset=[NSFFields.projectOutComesReport],
        ).copy()

        if outcomes_awards.empty:
            # Nothing to predict on; avoid handing the model zero rows.
            # The empty column keeps the join below producing the
            # "prediction_from_outcomes" column (all missing values).
            log.warning(
                "No awards with a project outcomes report found; "
                "skipping prediction from outcomes."
            )
            outcomes_awards["prediction_from_outcomes"] = None
        else:
            log.info(
                "Loading model and predicting software production from outcomes."
            )
            model = load_tfidf_logit_for_prediction_from_outcomes()
            outcomes_awards["prediction_from_outcomes"] = model.predict(
                outcomes_awards[NSFFields.projectOutComesReport],
            )
        outcomes_awards = outcomes_awards[[NSFFields.id_, "prediction_from_outcomes"]]

        # Join the outcomes predictions back to the full set
        # (awards without an outcomes report get a missing value)
        awards = awards.join(outcomes_awards.set_index(NSFFields.id_), on=NSFFields.id_)

        # Store
        outfile = args.outfile.resolve()
        # Make sure the target directory exists so the gathered data is not
        # lost to a missing-directory error at write time
        outfile.parent.mkdir(parents=True, exist_ok=True)
        awards.to_csv(outfile, index=False)
        awards.to_parquet(outfile.with_suffix(".parquet"))
        log.info(f"Awards dataset stored to: '{outfile}'.")
    except Exception as e:
        log.error("=============================================")
        log.error("\n\n" + traceback.format_exc())
        log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)



###############################################################################
# Allow caller to directly run this module (usually in development scenarios).
# The guard keeps `main()` from running when the module is merely imported.

if __name__ == "__main__":
    main()