# Source code for soft_search.nsf

#!/usr/bin/env python

from datetime import datetime
from typing import List, Optional, Union

import pandas as pd
import requests
from tqdm import tqdm

from .constants import (
    ALL_NSF_FIELDS,
    NSF_PROGRAM_TO_CFDA_NUMBER_LUT,
    NSFFields,
    NSFPrograms,
)

###############################################################################
# Constants

# Query URL template for the NSF Award Search API.
# Each {placeholder} is filled by `_get_nsf_chunk`; dates are MM/DD/YYYY
# strings, `dataset_fields` is a comma-separated field list, and
# `require_project_outcomes` is the lowercase string "true"/"false".
# NOTE(review): the leading "?&" is redundant but accepted by the API.
_NSF_API_URL_TEMPLATE = (
    "https://api.nsf.gov/services/v1/awards.json?"
    "&agency={agency}"
    "&dateStart={start_date}"
    "&dateEnd={end_date}"
    "&cfdaNumber={cfda_number}"
    "&transType={transaction_type}"
    "&printFields={dataset_fields}"
    "&projectOutcomesOnly={require_project_outcomes}"
    "&offset={offset}"
)

###############################################################################


def _parse_nsf_datetime(dt: Union[str, datetime]) -> str:
    if isinstance(dt, str):
        # Assume "/" means MM/DD/YYYY format
        if "/" in dt:
            return dt

        # Assume "-" means isoformat
        if "-" in dt:
            dt = datetime.fromisoformat(dt)
        # Anything else, raise
        else:
            raise ValueError(
                f"Provided value to `start_date` parameter must be provided as "
                f"either MM/DD/YYYY or YYYY-MM-DD format. Received: '{dt}'"
            )

    # Should either be already formated (from "/")
    # or we had isoformat conversion or provided datetime
    return dt.strftime("%m/%d/%Y")


def _get_nsf_chunk(
    start_date: str,
    end_date: str,
    cfda_number: str,
    agency: str,
    transaction_type: str,
    dataset_fields: str,
    require_project_outcomes: str,
    offset: int,
) -> pd.DataFrame:
    """
    Request a single page of awards from the NSF Award Search API.

    Parameters
    ----------
    start_date: str
        Lower date bound, already formatted as MM/DD/YYYY.
    end_date: str
        Upper date bound, already formatted as MM/DD/YYYY.
    cfda_number: str
        The CFDA number for the program being queried.
    agency: str
        The funding agency (e.g. "NSF").
    transaction_type: str
        The award type (e.g. "Grant").
    dataset_fields: str
        Comma-separated list of fields to request.
    require_project_outcomes: str
        "true" or "false" (lowercase) for the projectOutcomesOnly flag.
    offset: int
        1-based result offset for paging.

    Returns
    -------
    pd.DataFrame
        The awards in this page, or an empty frame when the page is
        past the end of the result set.
    """
    # Make the request.
    # Fix: add a timeout so a stalled connection cannot hang forever.
    response = requests.get(
        _NSF_API_URL_TEMPLATE.format(
            start_date=start_date,
            end_date=end_date,
            cfda_number=cfda_number,
            agency=agency,
            transaction_type=transaction_type,
            dataset_fields=dataset_fields,
            require_project_outcomes=require_project_outcomes,
            offset=offset,
        ),
        timeout=120,
    )
    # Fix: surface HTTP-level failures explicitly instead of failing
    # later with a confusing JSON / KeyError on an error body.
    response.raise_for_status()

    # Parse and return
    response_json = response.json()["response"]
    if "award" in response_json:
        return pd.DataFrame(response_json["award"])

    # No "award" key means an empty page of results
    return pd.DataFrame()


def get_nsf_dataset(
    start_date: Union[str, datetime],
    end_date: Optional[Union[str, datetime]] = None,
    program_name: str = NSFPrograms.Biological_Sciences,
    agency: str = "NSF",
    transaction_type: str = "Grant",
    dataset_fields: List[str] = ALL_NSF_FIELDS,
    require_project_outcomes_doc: bool = True,
) -> pd.DataFrame:
    """
    Fetch an NSF awards dataset.

    Wraps the NSF Award Search API:
    https://www.research.gov/common/webapi/awardapisearch-v1.htm.

    Parameters
    ----------
    start_date: Union[str, datetime]
        The datetime for which awards were granted after.
        When provided as a string, "MM/DD/YYYY" and "YYYY-MM-DD"
        formats are accepted.
    end_date: Optional[Union[str, datetime]]
        The datetime for which awards were granted before.
        When provided as a string, "MM/DD/YYYY" and "YYYY-MM-DD"
        formats are accepted.
        Default: None (no end date)
    program_name: str
        The program to search for awards against.
        Default: "BIO"
    agency: str
        The funding agency.
        Default: "NSF"
    transaction_type: str
        The award type.
        Default: "Grant"
    dataset_fields: List[str]
        The fields to retrieve.
        Default: All fields available in the
        `soft_search.constants.NSFFields` object.
    require_project_outcomes_doc: bool
        Should only awards that have already returned project outcomes
        documents be requested.
        Default: True (request only projects with outcomes)

    Returns
    -------
    pd.DataFrame
        All awards found as a pandas DataFrame.

    Examples
    --------
    Get all grants funded by the NSF that have project outcomes under the
    BIO program from 2017 onward.

    >>> from soft_search.nsf import get_nsf_dataset
    >>> get_nsf_dataset(start_date="2017-01-01")

    Get all grants funded by the NSF that have project outcomes under the
    BIO program from 2017 onward but only return the id and abstractText
    fields.

    >>> from soft_search.nsf import get_nsf_dataset
    >>> from soft_search.constants import NSFFields
    >>> get_nsf_dataset(
    ...     start_date="2017-01-01",
    ...     dataset_fields=[
    ...         NSFFields.id_,
    ...         NSFFields.abstractText,
    ...     ]
    ... )

    See Also
    --------
    soft_search.constants.NSFFields
        Available dataset fields to request.
    soft_search.constants.NSFPrograms
        Available programs to request.

    Notes
    -----
    After a lot of testing, it seems like the NSF Award Search API does
    not return all results available via "Simple Search" or
    "Advanced Search". This function is safe for prototyping but for
    research purposes it is recommended to download data files from the
    "Advanced Search" webpage.
    """
    # Parse datetimes; an omitted end date means "up to now"
    formatted_start_date = _parse_nsf_datetime(start_date)
    if end_date is None:
        end_date = datetime.utcnow()
    formatted_end_date = _parse_nsf_datetime(end_date)

    # Convert dataset fields to the comma-separated str the API expects
    str_dataset_fields = ",".join(dataset_fields)

    # Convert required project outcomes bool to a lowercase str flag
    str_require_project_outcomes = str(require_project_outcomes_doc).lower()

    # Reverse lookup the cfda number from the program name
    cfda_number = NSF_PROGRAM_TO_CFDA_NUMBER_LUT[program_name]

    # Gather pages until a short page signals the end of results
    current_offset = 1
    chunks: List[pd.DataFrame] = []
    with tqdm(desc="Iterating data chunks...") as pbar:
        while True:
            # Get chunk
            chunk = _get_nsf_chunk(
                start_date=formatted_start_date,
                end_date=formatted_end_date,
                cfda_number=cfda_number,
                agency=agency,
                transaction_type=transaction_type,
                dataset_fields=str_dataset_fields,
                require_project_outcomes=str_require_project_outcomes,
                offset=current_offset,
            )
            chunks.append(chunk)

            # The default request size for NSF is 25.
            # If we received fewer than 25 results, we can assume we are done.
            if len(chunk) < 25:
                break

            # Advance to the next page
            current_offset += 25
            pbar.update(1)

    # Concat all awards, dropping duplicates pages may share
    return (
        pd.concat(chunks, ignore_index=True)
        .drop_duplicates(NSFFields.id_)
        .reset_index(drop=True)
    )