# Source code for soft_search.nsf

#!/usr/bin/env python

from datetime import datetime
from typing import List, Optional, Union

import pandas as pd
import requests
from tqdm import tqdm

from .constants import (
    ALL_NSF_FIELDS,
    NSF_PROGRAM_TO_CFDA_NUMBER_LUT,
    NSFFields,
    NSFPrograms,
)

###############################################################################
# Constants

# Query URL template for the NSF Award Search API.
# Each {placeholder} is filled by `_get_nsf_chunk`; dates are MM/DD/YYYY
# strings, `dataset_fields` is a comma-separated field list, and
# `require_project_outcomes` is the lowercase string "true"/"false".
# NOTE(review): the leading "?&" is redundant but accepted by the API.
_NSF_API_URL_TEMPLATE = (
    "https://api.nsf.gov/services/v1/awards.json?"
    "&agency={agency}"
    "&dateStart={start_date}"
    "&dateEnd={end_date}"
    "&cfdaNumber={cfda_number}"
    "&transType={transaction_type}"
    "&printFields={dataset_fields}"
    "&projectOutcomesOnly={require_project_outcomes}"
    "&offset={offset}"
)

###############################################################################


def _parse_nsf_datetime(dt: Union[str, datetime]) -> str:
    if isinstance(dt, str):
        # Assume "/" means MM/DD/YYYY format
        if "/" in dt:
            return dt

        # Assume "-" means isoformat
        if "-" in dt:
            dt = datetime.fromisoformat(dt)
        # Anything else, raise
        else:
            raise ValueError(
                f"Provided value to `start_date` parameter must be provided as "
                f"either MM/DD/YYYY or YYYY-MM-DD format. Received: '{dt}'"
            )

    # Should either be already formated (from "/")
    # or we had isoformat conversion or provided datetime
    return dt.strftime("%m/%d/%Y")


def _get_nsf_chunk(
    start_date: str,
    end_date: str,
    cfda_number: str,
    agency: str,
    transaction_type: str,
    dataset_fields: str,
    require_project_outcomes: str,
    offset: int,
) -> pd.DataFrame:
    """
    Request a single page of awards from the NSF Award Search API.

    Parameters
    ----------
    start_date: str
        Lower date bound, already formatted as MM/DD/YYYY.
    end_date: str
        Upper date bound, already formatted as MM/DD/YYYY.
    cfda_number: str
        The CFDA number for the program being queried.
    agency: str
        The funding agency (e.g. "NSF").
    transaction_type: str
        The award type (e.g. "Grant").
    dataset_fields: str
        Comma-separated list of fields to request.
    require_project_outcomes: str
        "true" or "false" (lowercase) for the projectOutcomesOnly flag.
    offset: int
        1-based result offset for paging.

    Returns
    -------
    pd.DataFrame
        The awards in this page, or an empty frame when the page is
        past the end of the result set.
    """
    # Make the request.
    # Fix: add a timeout so a stalled connection cannot hang forever.
    response = requests.get(
        _NSF_API_URL_TEMPLATE.format(
            start_date=start_date,
            end_date=end_date,
            cfda_number=cfda_number,
            agency=agency,
            transaction_type=transaction_type,
            dataset_fields=dataset_fields,
            require_project_outcomes=require_project_outcomes,
            offset=offset,
        ),
        timeout=120,
    )
    # Fix: surface HTTP-level failures explicitly instead of failing
    # later with a confusing JSON / KeyError on an error body.
    response.raise_for_status()

    # Parse and return
    response_json = response.json()["response"]
    if "award" in response_json:
        return pd.DataFrame(response_json["award"])

    # No "award" key means an empty page of results
    return pd.DataFrame()


def get_nsf_dataset(
    start_date: Union[str, datetime],
    end_date: Optional[Union[str, datetime]] = None,
    program_name: str = NSFPrograms.Biological_Sciences,
    agency: str = "NSF",
    transaction_type: str = "Grant",
    dataset_fields: List[str] = ALL_NSF_FIELDS,
    require_project_outcomes_doc: bool = True,
) -> pd.DataFrame:
    """
    Fetch an NSF awards dataset.

    Wraps the NSF Award Search API:
    https://www.research.gov/common/webapi/awardapisearch-v1.htm.

    Parameters
    ----------
    start_date: Union[str, datetime]
        The datetime for which awards were granted after.
        When provided as a string, "MM/DD/YYYY" and "YYYY-MM-DD"
        formats are accepted.
    end_date: Optional[Union[str, datetime]]
        The datetime for which awards were granted before.
        When provided as a string, "MM/DD/YYYY" and "YYYY-MM-DD"
        formats are accepted.
        Default: None (no end date)
    program_name: str
        The program to search for awards against.
        Default: "BIO"
    agency: str
        The funding agency.
        Default: "NSF"
    transaction_type: str
        The award type.
        Default: "Grant"
    dataset_fields: List[str]
        The fields to retrieve.
        Default: All fields available in the
        `soft_search.constants.NSFFields` object.
    require_project_outcomes_doc: bool
        Should only awards that have already returned project outcomes
        documents be requested.
        Default: True (request only projects with outcomes)

    Returns
    -------
    pd.DataFrame
        All awards found as a pandas DataFrame.

    Examples
    --------
    Get all grants funded by the NSF that have project outcomes under the
    BIO program from 2017 onward.

    >>> from soft_search.nsf import get_nsf_dataset
    >>> get_nsf_dataset(start_date="2017-01-01")

    Get all grants funded by the NSF that have project outcomes under the
    BIO program from 2017 onward but only return the id and abstractText
    fields.

    >>> from soft_search.nsf import get_nsf_dataset
    >>> from soft_search.constants import NSFFields
    >>> get_nsf_dataset(
    ...     start_date="2017-01-01",
    ...     dataset_fields=[
    ...         NSFFields.id_,
    ...         NSFFields.abstractText,
    ...     ]
    ... )

    See Also
    --------
    soft_search.constants.NSFFields
        Available dataset fields to request.
    soft_search.constants.NSFPrograms
        Available programs to request.

    Notes
    -----
    After a lot of testing, it seems like the NSF Award Search API does
    not return all results available via "Simple Search" or
    "Advanced Search". This function is safe for prototyping but for
    research purposes it is recommended to download data files from the
    "Advanced Search" webpage.
    """
    # Parse datetimes; an omitted end date means "up to now"
    formatted_start_date = _parse_nsf_datetime(start_date)
    if end_date is None:
        end_date = datetime.utcnow()
    formatted_end_date = _parse_nsf_datetime(end_date)

    # Convert dataset fields to the comma-separated str the API expects
    str_dataset_fields = ",".join(dataset_fields)

    # Convert required project outcomes bool to a lowercase str flag
    str_require_project_outcomes = str(require_project_outcomes_doc).lower()

    # Reverse lookup the cfda number from the program name
    cfda_number = NSF_PROGRAM_TO_CFDA_NUMBER_LUT[program_name]

    # Gather pages until a short page signals the end of results
    current_offset = 1
    chunks: List[pd.DataFrame] = []
    with tqdm(desc="Iterating data chunks...") as pbar:
        while True:
            # Get chunk
            chunk = _get_nsf_chunk(
                start_date=formatted_start_date,
                end_date=formatted_end_date,
                cfda_number=cfda_number,
                agency=agency,
                transaction_type=transaction_type,
                dataset_fields=str_dataset_fields,
                require_project_outcomes=str_require_project_outcomes,
                offset=current_offset,
            )
            chunks.append(chunk)

            # The default request size for NSF is 25.
            # If we received fewer than 25 results, we can assume we are done.
            if len(chunk) < 25:
                break

            # Advance to the next page
            current_offset += 25
            pbar.update(1)

    # Concat all awards, dropping duplicates pages may share
    return (
        pd.concat(chunks, ignore_index=True)
        .drop_duplicates(NSFFields.id_)
        .reset_index(drop=True)
    )