Source code for soft_search.bin.get_github_repositories_with_nsf_ref
#!/usr/bin/env python

import argparse
import logging
import shutil
import sys
import time
import traceback
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from fastcore.net import HTTP4xxClientError
from ghapi.all import GhApi

###############################################################################

load_dotenv()

###############################################################################

SEARCH_QUERIES_START_PAGE = {
    "National Science Foundation": 0,
    "NSF Award": 0,
    "NSF Grant": 0,
    "Supported by the NSF": 0,
    "Supported by NSF": 0,
}
BATCH_SIZE = 10

###############################################################################
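
# Editorial note: each key in SEARCH_QUERIES_START_PAGE is rendered by `main`
# below into a GitHub code-search query scoped to README files, e.g.
# '"NSF Award" filename:README.md'. The helper below is a hypothetical sketch
# of that rendering and is not used by the original module:


def build_readme_query(term: str) -> str:
    # Quote the term for an exact-phrase match and restrict hits to READMEs
    return f'"{term}" filename:README.md'


###############################################################################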
class Args(argparse.Namespace):
    def __init__(self) -> None:
        self.__parse()

    def __parse(self) -> None:
        p = argparse.ArgumentParser(
            prog="get-github-repositories-with-nsf-ref",
            description="Search for GitHub repositories which reference NSF Awards.",
        )
        p.add_argument(
            "-o",
            "--outdir",
            dest="outdir",
            default=Path("gh-search-results/"),
            type=Path,
            help=(
                "The path to store all paginated results. "
                "Default: gh-search-results/"
            ),
        )
        p.add_argument(
            "-c",
            "--clean",
            dest="clean",
            default=True,
            type=bool,
            help=(
                "Before running the data gathering process, "
                "should any existing outdir be cleaned of existing files. "
                "Default: True (clean existing files)"
            ),
        )
        p.add_argument(
            "-t",
            "--token",
            dest="token",
            default=None,
            type=str,
            help=(
                "GitHub Personal Access Token to use for requests. "
                "If none provided, attempts load from `.env` file. "
                "If none found, uses no-auth requests which will take longer. "
                "Default: None (use .env)"
            ),
        )
        p.add_argument(
            "--debug",
            dest="debug",
            action="store_true",
            help="Run with debug logging.",
        )
        p.parse_args(namespace=self)
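
###############################################################################

# Editorial note: argparse's `type=bool` calls `bool()` on the raw string, and
# any non-empty string (including "False") is truthy, so `--clean False` still
# evaluates to True. A converter such as the hypothetical one below is a
# common workaround; it is a sketch and is not wired into `Args` above:


def _str_to_bool(value: str) -> bool:
    # Map common true/false spellings; reject anything else
    if value.lower() in ("true", "t", "yes", "y", "1"):
        return True
    if value.lower() in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean, got {value!r}")


###############################################################################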
def main() -> None:  # noqa: C901
    # Get args
    args = Args()

    # Determine log level
    if args.debug:
        log_level = logging.DEBUG
    else:
        log_level = logging.INFO

    # Setup logging
    logging.basicConfig(
        level=log_level,
        format="[%(levelname)4s: %(module)s:%(lineno)4s %(asctime)s] %(message)s",
    )
    log = logging.getLogger(__name__)

    try:
        # Determine token / api
        if args.token is None:
            load_dotenv()
            api = GhApi()
        else:
            api = GhApi(token=args.token)

        # Clean
        if args.clean:
            if args.outdir.exists():
                shutil.rmtree(args.outdir)

        # Make dir if needed
        args.outdir.mkdir(parents=True, exist_ok=True)

        # Get all results for each term
        query_start_time = time.time()
        for query, page in SEARCH_QUERIES_START_PAGE.items():
            log.info(f"Beginning page requests for: '{query}'")
            complete_query = f'"{query}" filename:README.md'

            # Page through results until done
            all_gathered = False
            while not all_gathered:
                try:
                    log.debug(f"Querying: '{complete_query}', Page: {page}")
                    page_results = api(
                        "/search/code",
                        "GET",
                        query={
                            "q": complete_query,
                            "per_page": BATCH_SIZE,
                            "page": page,
                        },
                    )
                    total_count = page_results["total_count"]
                    real_count = total_count if total_count < 1000 else 1000
                    items_returned = page_results["items"]

                    # Unpack results
                    results = []
                    for item in items_returned:
                        repo_details = item["repository"]
                        repo_name = repo_details["name"]
                        owner_details = repo_details["owner"]
                        owner_name = owner_details["login"]
                        full_name = f"{owner_name}/{repo_name}"

                        # Get languages
                        languages = api(f"/repos/{full_name}/languages")

                        # Get latest commit datetime
                        commits = api(f"/repos/{full_name}/commits")
                        most_recent_commit = commits[0]["commit"]
                        most_recent_committer = most_recent_commit["committer"]
                        most_recent_committer_name = most_recent_committer["name"]
                        most_recent_committer_email = most_recent_committer["email"]
                        most_recent_commit_dt = datetime.fromisoformat(
                            # We remove the last character because it is 'Z'
                            # for "Zulu" -- datetimes are naturally UTC / Zulu
                            most_recent_committer["date"][:-1]
                        )

                        # Append this result to all results
                        results.append(
                            {
                                "owner": owner_name,
                                "name": repo_name,
                                "link": f"https://github.com/{full_name}",
                                "languages": "; ".join(languages.keys()),
                                "most_recent_committer_name": most_recent_committer_name,
                                "most_recent_committer_email": most_recent_committer_email,
                                "most_recent_commit_datetime": most_recent_commit_dt.isoformat(),
                                "most_recent_commit_timestamp": most_recent_commit_dt.timestamp(),
                                "query": query,
                            }
                        )

                    # Store partial results
                    if len(results) != 0:
                        save_name = f"{query.lower().replace(' ', '_')}-page_{page}.csv"
                        pd.DataFrame(results).to_csv(
                            args.outdir / save_name,
                            index=False,
                        )

                    # Increase page and keep going
                    page += 1

                    # Wait to avoid rate limiting
                    log.debug("Sleeping for one minute...")
                    time.sleep(60)

                    # Update time estimate
                    batch_time = time.time()
                    seconds_diff = batch_time - query_start_time
                    seconds_diff_per_page = seconds_diff / page
                    total_pages_required = real_count / BATCH_SIZE
                    remaining_pages = total_pages_required - page
                    estimated_remaining_seconds = seconds_diff_per_page * remaining_pages
                    log.info(
                        f"Remaining pages: {remaining_pages} "
                        f"(of {total_pages_required} -- "
                        f"est. {timedelta(seconds=estimated_remaining_seconds)})"
                    )

                    # Break because we are done -- stop at 1,000 results
                    # because GitHub limits search:
                    # https://github.com/PyGithub/PyGithub/issues/1072#issuecomment-499211486
                    if len(items_returned) == 0 or page * BATCH_SIZE >= 1000:
                        log.info("Reached GitHub max search results.")
                        break

                # Log client errors (e.g. rate limiting) and retry this page
                except HTTP4xxClientError as e:
                    log.error(f"Caught exception: {e}")

    except Exception as e:
        log.error("=============================================")
        log.error("\n\n" + traceback.format_exc())
        log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
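
###############################################################################

# Editorial note: `main` writes one CSV per query term and page into `outdir`.
# The hypothetical helper below is a minimal sketch for combining those
# partial results afterwards, assuming the `<query>-page_<n>.csv` naming used
# above; it is not part of the original module:


def combine_partial_results(
    outdir: Path = Path("gh-search-results/"),
) -> pd.DataFrame:
    # Stack every per-page CSV into a single frame
    frames = [pd.read_csv(csv_path) for csv_path in sorted(outdir.glob("*.csv"))]
    combined = pd.concat(frames, ignore_index=True)
    # The same repository can match multiple query terms; keep one row per repo
    return combined.drop_duplicates(subset="link", ignore_index=True)
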
###############################################################################
# Allow caller to directly run this module (usually in development scenarios)

if __name__ == "__main__":
    main()
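
# Example invocation (assuming the package and its dependencies are installed;
# flags correspond to `Args` above):
#
#   python -m soft_search.bin.get_github_repositories_with_nsf_ref \
#       --outdir gh-search-results/ --token "$GITHUB_TOKEN" --debug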