#!/usr/bin/env python
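"""Search GitHub for repositories whose README files reference NSF awards.

Paginated search results are stored as CSV files in the output directory.
"""
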
import argparse
import logging
import shutil
import sys
import time
import traceback
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
from dotenv import load_dotenv
from fastcore.net import HTTP4xxClientError
from ghapi.all import GhApi
###############################################################################
load_dotenv()
###############################################################################
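# Search phrases to look for in README files, each mapped to the page of
# search results to start requesting from.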
SEARCH_QUERIES_START_PAGE = {
    "National Science Foundation": 0,
    "NSF Award": 0,
    "NSF Grant": 0,
    "Supported by the NSF": 0,
    "Supported by NSF": 0,
}
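
# Number of results requested per page of the code search API.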
BATCH_SIZE = 10
###############################################################################


class Args(argparse.Namespace):
    def __init__(self) -> None:
        self.__parse()

    def __parse(self) -> None:
        p = argparse.ArgumentParser(
            prog="get-github-repositories-with-nsf-ref",
            description=("Search for GitHub repositories which reference NSF Awards."),
        )
        p.add_argument(
            "-o",
            "--outdir",
            dest="outdir",
            default=Path("gh-search-results/"),
            type=Path,
            help=(
                "The path to store all paginated results. "
                "Default: gh-search-results/"
            ),
        )
        p.add_argument(
            "-c",
            "--clean",
            dest="clean",
            default=True,
            # Boolean flag: accepts -c / --clean / --no-clean
            action=argparse.BooleanOptionalAction,
            help=(
                "Before running the data gathering process, "
                "should any existing outdir be cleaned of existing files. "
                "Default: True (clean existing files)"
            ),
        )
        p.add_argument(
            "-t",
            "--token",
            dest="token",
            default=None,
            type=str,
            help=(
                "GitHub Personal Access Token to use for requests. "
                "If none provided, attempts load from `.env` file. "
                "If none found, uses no-auth requests which will take longer. "
                "Default: None (use .env)"
            ),
        )
        p.add_argument(
            "--debug",
            dest="debug",
            action="store_true",
            help="Run with debug logging.",
        )
        p.parse_args(namespace=self)
###############################################################################


def main() -> None:  # noqa: C901
    # Get args
    args = Args()

    # Determine log level
    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO

    # Setup logging
    logging.basicConfig(
        level=log_level,
        format="[%(levelname)4s: %(module)s:%(lineno)4s %(asctime)s] %(message)s",
    )
    log = logging.getLogger(__name__)

    try:
        # Determine token / api
        if args.token is None:
            load_dotenv()
            api = GhApi()
        else:
            api = GhApi(token=args.token)

        # Clean
        if args.clean:
            if args.outdir.exists():
                shutil.rmtree(args.outdir)

        # Make dir if needed
        args.outdir.mkdir(parents=True, exist_ok=True)

        # Get all results for each term
        query_start_time = time.time()
        for query, page in SEARCH_QUERIES_START_PAGE.items():
            log.info(f"Beginning page requests for: '{query}'")
            complete_query = f'"{query}" filename:README.md'

            # Get initial
            all_gathered = False
            while not all_gathered:
                try:
                    log.debug(f"Querying: '{complete_query}', Page: {page}")
                    page_results = api(
                        "/search/code",
                        "GET",
                        query={
                            "q": complete_query,
                            "per_page": BATCH_SIZE,
                            "page": page,
                        },
                    )
                    total_count = page_results["total_count"]
                    real_count = total_count if total_count < 1000 else 1000
                    items_returned = page_results["items"]

                    # Unpack results
                    results = []
                    for item in items_returned:
                        repo_details = item["repository"]
                        repo_name = repo_details["name"]
                        owner_details = repo_details["owner"]
                        owner_name = owner_details["login"]
                        full_name = f"{owner_name}/{repo_name}"

                        # Get languages
                        languages = api(f"/repos/{full_name}/languages")

                        # Get latest commit datetime
                        commits = api(f"/repos/{full_name}/commits")
                        most_recent_commit = commits[0]["commit"]
                        most_recent_committer = most_recent_commit["committer"]
                        most_recent_committer_name = most_recent_committer["name"]
                        most_recent_committer_email = most_recent_committer["email"]
                        most_recent_commit_dt = datetime.fromisoformat(
                            # We remove last character because it is 'Z' for "Zulu"
                            # Datetimes are naturally UTC/Zulu
                            most_recent_committer["date"][:-1]
                        )

                        # Append this result to all results
                        results.append(
                            {
                                "owner": owner_name,
                                "name": repo_name,
                                "link": f"https://github.com/{full_name}",
                                "languages": "; ".join(languages.keys()),
                                "most_recent_committer_name": (
                                    most_recent_committer_name
                                ),
                                "most_recent_committer_email": (
                                    most_recent_committer_email
                                ),
                                "most_recent_commit_datetime": (
                                    most_recent_commit_dt.isoformat()
                                ),
                                "most_recent_commit_timestamp": (
                                    most_recent_commit_dt.timestamp()
                                ),
                                "query": query,
                            }
                        )

                    # Store partial results
                    if len(results) != 0:
                        save_name = f"{query.lower().replace(' ', '_')}-page_{page}.csv"
                        pd.DataFrame(results).to_csv(
                            args.outdir / save_name,
                            index=False,
                        )

                    # Increase page and keep going
                    page += 1

                    # Wait to avoid rate limiting
                    log.debug("Sleeping for one minute...")
                    time.sleep(60)

                    # Update time estimate
                    batch_time = time.time()
                    seconds_diff = batch_time - query_start_time
                    seconds_diff_per_page = seconds_diff / page
                    total_pages_required = real_count / BATCH_SIZE
                    remaining_pages = total_pages_required - page
                    estimated_remaining_seconds = (
                        seconds_diff_per_page * remaining_pages
                    )
                    log.info(
                        f"Remaining pages: {remaining_pages} "
                        f"(of {total_pages_required} -- "
                        f"est. {timedelta(seconds=estimated_remaining_seconds)})"
                    )

                    # Break because we are done
                    # Stop at 1000 results because GitHub limits search
                    # https://github.com/PyGithub/PyGithub/issues/1072#issuecomment-499211486
                    if len(items_returned) == 0 or page * BATCH_SIZE >= 1000:
                        log.info("Reached GitHub max search results.")
                        break

                except HTTP4xxClientError as e:
                    log.error(f"Caught exception: {e}")
                    # Back off before retrying the same page so the API is not
                    # hammered while rate limited
                    time.sleep(60)

    except Exception as e:
        log.error("=============================================")
        log.error("\n\n" + traceback.format_exc())
        log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)

###############################################################################
# Allow caller to directly run this module (usually in development scenarios)
if __name__ == "__main__":
    main()
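
# Example invocation (the module filename here is hypothetical):
#   python get_github_repositories_with_nsf_ref.py --token <YOUR_GH_PAT> --debug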