diff --git a/nwm_filenames/filename_helpers.py b/nwm_filenames/filename_helpers.py
new file mode 100644
index 0000000..2660a80
--- /dev/null
+++ b/nwm_filenames/filename_helpers.py
@@ -0,0 +1,23 @@
+from concurrent.futures import ThreadPoolExecutor
+import requests
+from functools import partial
+
+
+def check_valid_urls(file_list, session=None):
+    if not session:
+        session = requests.Session()
+    check_url_part = partial(check_url, session)
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        valid_file_list = list(executor.map(check_url_part, file_list))
+
+    return [file for file in valid_file_list if file is not None]
+
+
+def check_url(session, file):
+    try:
+        # Stream the GET so only headers are fetched; raise_for_status flags HTTP errors.
+        with session.get(file, stream=True, timeout=1) as response:
+            response.raise_for_status()
+            return file
+    except requests.exceptions.RequestException:
+        pass
diff --git a/nwm_filenames/listofnwmfilenames.py b/nwm_filenames/listofnwmfilenames.py
index 1828985..9072164 100644
--- a/nwm_filenames/listofnwmfilenames.py
+++ b/nwm_filenames/listofnwmfilenames.py
@@ -2,6 +2,8 @@
 from datetime import datetime, timezone
 from itertools import product
 
+from filename_helpers import check_valid_urls
+
 rundict = {
     1: "short_range",
     2: "medium_range",
@@ -141,8 +143,8 @@ def select_lead_time(lead_time=None, default=None):
     3: "https://storage.googleapis.com/national-water-model/",
     4: "https://storage.cloud.google.com/national-water-model/",
     5: "gs://national-water-model/",
-    6: "https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/",
-    7: "s3://noaa-nwm-retrospective-2-1-pds/model_output/",
+    6: "gcs://national-water-model/",
+    7: "https://noaa-nwm-pds.s3.amazonaws.com/",
 }
 
 
@@ -164,7 +166,7 @@ def create_file_list(
     urlbaseinput=None,
     lead_time=None,  # TODO: change this order; placed here to avoid breaking change
 ):
-    """ for given date, run, var, fcst_cycle, and geography, print file names for the valid time (the range of fcst_hours) and dates"""
+    """for given date, run, var, fcst_cycle, and geography, print file names for the valid time (the range of fcst_hours) and dates"""
 
     runsuff = ""
 
@@ -193,7 +195,11 @@ def create_file_list(
         _dtstart = today
         _until = today
 
-    dates = rrule.rrule(rrule.DAILY, dtstart=_dtstart, until=_until,)
+    dates = rrule.rrule(
+        rrule.DAILY,
+        dtstart=_dtstart,
+        until=_until,
+    )
     run_t = run_type(runinput, varinput, geoinput, run_name)
     fhp = fhprefix(runinput)
     vsuff = varsuffix(meminput)
@@ -393,36 +399,40 @@ def create_file_list(
 
 
 def main():
-    start_date = "20220822"
-    end_date = "20220824"
+    start_date = "20030402"
+    end_date = "20030420"
 
-    fcst_cycle = [12, 18]
-    lead_time = [1, 2, 240]
+    fcst_cycle = [5, 12]
+    lead_time = [158]
     # fcst_cycle = None  # Retrieves a full day for each day within the range given.
 
     runinput = 2
 
-    varinput = 1
+    varinput = 3
 
     geoinput = 1
 
-    meminput = 1
-
-    urlbaseinput = None
-
-    print(
-        create_file_list(
-            runinput,
-            varinput,
-            geoinput,
-            meminput,
-            start_date,
-            end_date,
-            fcst_cycle,
-            urlbaseinput,
-            lead_time,
-        )
+    meminput = 5
+
+    urlbaseinput = 3
+
+    file_list = create_file_list(
+        runinput,
+        varinput,
+        geoinput,
+        meminput,
+        start_date,
+        end_date,
+        fcst_cycle,
+        urlbaseinput,
+        lead_time,
     )
+    if len(file_list) == 0:
+        print("No files found")
+    else:
+        print(f"Files: {file_list}\nTotal files: {len(file_list)}")
+        valid_file_list = check_valid_urls(file_list)
+        print(f"Valid files: {valid_file_list}\nTotal valid files: {len(valid_file_list)}")
 
 
 if __name__ == "__main__":
diff --git a/nwm_filenames/listofnwmfilenamesretro.py b/nwm_filenames/listofnwmfilenamesretro.py
index 04967a3..5e4d5e4 100644
--- a/nwm_filenames/listofnwmfilenamesretro.py
+++ b/nwm_filenames/listofnwmfilenamesretro.py
@@ -1,96 +1,105 @@
-import requests
-from dateutil import rrule
-from datetime import datetime
-from itertools import product
-import multiprocessing
-from multiprocessing.pool import Pool
-
-retrospective_var_types = [
-    ".CHRTOUT_DOMAIN1.comp",
-    ".GWOUT_DOMAIN1.comp",
-    ".LAKEOUT_DOMAIN1.comp",
-    ".LDASOUT_DOMAIN1.comp",
-    ".RTOUT_DOMAIN1.comp",
-    ".LDASIN_DOMAIN1.comp",
-]
-objecttype = ["forcing/", "model_output/"]
-
-def generate_url(date, file_type, urlbase_prefix, data_type):
-    year_txt = f"{date.strftime('%Y')}"
-    date_txt = f"{date.strftime('%Y%m%d%H')}"
-    urlbase_prefix = urlbase_prefix + objecttype[file_type - 1]
-
-    if data_type == 6:
-        url = f"{urlbase_prefix}{year_txt}/{date_txt}00.LDASIN_DOMAIN1"
-    else:
-        url = f"{urlbase_prefix}{year_txt}/{date_txt}00{retrospective_var_types[data_type - 1]}"
-    return url
-
+from datetime import datetime, timedelta
+
-def validate_date_range(start_date, end_date):
-    _dtstart = datetime.strptime(start_date, "%Y%m%d%H%M")
-    _until = datetime.strptime(end_date, "%Y%m%d%H%M")
-    return _dtstart, _until
-
-def retrospective_archive_file_name_creator(start_date, end_date, objecttype, file_types, urlbase_prefix):
-    _dtstart, _until = validate_date_range(start_date, end_date)
-    dates = rrule.rrule(
-        rrule.HOURLY,
-        dtstart=_dtstart,
-        until=_until,
-    )
+from filename_helpers import check_valid_urls
+
-    r = []
-    datetimes = product(dates, range(1))
-    for _dt, th in datetimes:
-        for tp in file_types:
-            for obj_type in objecttype:
-                file_name = generate_url(_dt, obj_type, urlbase_prefix, tp)
-                if file_name is not None:
-                    r.append(file_name)
+retrospective_var_types = {
+    1: ".CHRTOUT_DOMAIN1.comp",
+    2: ".GWOUT_DOMAIN1.comp",
+    3: ".LAKEOUT_DOMAIN1.comp",
+    4: ".LDASOUT_DOMAIN1.comp",
+    5: ".RTOUT_DOMAIN1.comp",
+    6: ".LDASIN_DOMAIN1.comp",
+}
+
-    return r, len(r)
+objecttypes = {1: "forcing/", 2: "model_output/"}
 
 urlbasedict = {
     6: "https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/",
     7: "s3://noaa-nwm-retrospective-2-1-pds/model_output/",
 }
 
-def create_file_list_retro(start_date=None, end_date=None, fcst_cycle=None, urlbaseinput=None, file_types=[1], objecttype=None):
+
+def generate_url(date, file_type, urlbase_prefix, retrospective_var_types=None):
+    year_txt = date.strftime("%Y")
+    date_txt = date.strftime("%Y%m%d%H")
+
+    url = None  # Stays None if no naming rule matches the object type.
+    if "forcing" in file_type and date.year < 2007:
+        url = f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}00.LDASIN_DOMAIN1"
+    elif "forcing" in file_type and date.year >= 2007:
+        url = f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}.LDASIN_DOMAIN1"
+    elif "model_output" in file_type:
+        url = [
f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}00{type}" + for type in retrospective_var_types + ] + + return url + + +def create_file_list_retro( + start_date=None, + end_date=None, + urlbaseinput=None, + objecttype=objecttypes, + selected_var_types=None, +): urlbase_prefix = urlbasedict[urlbaseinput] + objecttype = [objecttypes[i] for i in objecttype] + retrospective_var_types_selected = [ + retrospective_var_types[i] for i in selected_var_types + ] + + start_dt = datetime.strptime(start_date, "%Y%m%d%H%M") + end_dt = datetime.strptime(end_date, "%Y%m%d%H%M") - if urlbaseinput == 6: - return retrospective_archive_file_name_creator(start_date, end_date, objecttype, file_types, urlbase_prefix) + delta = end_dt - start_dt + date_range = [ + start_dt + timedelta(hours=i) + for i in range(delta.days * 24 + delta.seconds // 3600 + 1) + ] -def check_url(file): - try: - response = requests.head(file, timeout=1) - if response.status_code == 200: - return file - except requests.exceptions.RequestException: - pass + file_list = [] + for date in date_range: + for obj_type in objecttype: + file_names = generate_url( + date, obj_type, urlbase_prefix, retrospective_var_types_selected + ) + if file_names is not None: + if isinstance(file_names, list): + file_list.extend(file_names) + else: + file_list.append(file_names) + + return file_list -def check_valid_urls(file_list): - with Pool(multiprocessing.cpu_count()) as p: - valid_file_list = p.map(check_url, file_list) - return [file for file in valid_file_list if file is not None] def main(): - start_date = "19790201" - end_date = "19790202" - fcst_cycle = [12, 18] + start_date = "20070101" + end_date = "20070102" urlbaseinput = 6 - file_types = [5, 6] - objecttype = [1] + selected_var_types = [1, 2] + selected_object_types = [1] # To test both forcing and model_output start_time = "0000" end_time = "0800" - file_list, length = create_file_list_retro(start_date + start_time, end_date + end_time, fcst_cycle, urlbaseinput, file_types, objecttype) - if length == 0: + + file_list = create_file_list_retro( + start_date + start_time, + end_date + end_time, + urlbaseinput, + selected_object_types, + selected_var_types, + ) + + if len(file_list) == 0: print(f"No files found") else: - print(f"Files: {file_list}\nTotal files: {len(file_list)}") + print(f"Files: {file_list}\nTotal Files: {len(file_list)}") + valid_file_list = check_valid_urls(file_list) + print(f"Valid Files: {valid_file_list}\nValid files: {len(valid_file_list)}") + if __name__ == "__main__": - main() \ No newline at end of file + main()