Modified script to make it more efficient and added a few conditions. (#62)

* validate_url is set to False and the timeout is set to 4 seconds, which makes the output display much faster (a rough sketch of this behavior follows the changed-file summary below)

* Applied black formatting

* The retrofiles script has been cleaned up; it now has functions, variables, and URLs corresponding to the files needed

* Modified the script to run more efficiently
---------

Co-authored-by: James Halgren <[email protected]>
RohanSunkarapalli and James Halgren authored Jun 16, 2023
1 parent 6234a55 commit 1f9a6f5
Showing 3 changed files with 138 additions and 97 deletions.
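The validate_url flag and 4-second timeout mentioned in the first bullet are not visible in this diff, so the sketch below is an assumption about the intended behavior rather than code from the commit: validation is skipped unless explicitly requested, and each probe is bounded by a short timeout.

import requests


def get_file_list(file_list, validate_url=False, timeout=4):
    # Hypothetical sketch of the behavior described in the commit message;
    # get_file_list, validate_url, and the default values are assumed names,
    # not code from this commit.
    if not validate_url:
        # Skipping validation avoids one network round-trip per file.
        return file_list
    session = requests.Session()
    valid = []
    for url in file_list:
        try:
            # stream=True fetches only headers; timeout bounds each probe.
            with session.get(url, stream=True, timeout=timeout) as resp:
                resp.raise_for_status()
                valid.append(url)
        except requests.exceptions.RequestException:
            pass  # unreachable or slow URLs are dropped from the output
    return valid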
23 changes: 23 additions & 0 deletions nwm_filenames/filename_helpers.py
@@ -0,0 +1,23 @@
from concurrent.futures import ThreadPoolExecutor
import requests
from functools import partial


def check_valid_urls(file_list, session=None):
    # Probe every candidate URL in parallel and keep only the reachable ones.
    if not session:
        session = requests.Session()
    check_url_part = partial(check_url, session)
    with ThreadPoolExecutor(max_workers=10) as executor:
        valid_file_list = list(executor.map(check_url_part, file_list))

    return [file for file in valid_file_list if file is not None]


def check_url(session, file):
    # Use the shared session and stream the response so only the headers
    # are fetched before the URL is validated.
    try:
        with session.get(file, stream=True, timeout=1) as response:
            response.raise_for_status()
            return file
    except requests.exceptions.RequestException:
        return None
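A minimal way to exercise the helper above, e.g. appended to filename_helpers.py (the URLs are placeholders for demonstration, not files known to exist):

if __name__ == "__main__":
    # Placeholder URLs; real NWM object URLs would come from the
    # filename-generation scripts changed in this commit.
    candidates = [
        "https://storage.googleapis.com/national-water-model/does_not_exist.comp",
        "https://example.invalid/also_missing.comp",
    ]
    reachable = check_valid_urls(candidates)
    print(f"{len(reachable)} of {len(candidates)} URLs responded")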
60 changes: 35 additions & 25 deletions nwm_filenames/listofnwmfilenames.py
@@ -2,6 +2,8 @@
from datetime import datetime, timezone
from itertools import product

+from filename_helpers import check_valid_urls

rundict = {
1: "short_range",
2: "medium_range",
@@ -141,8 +143,8 @@ def select_lead_time(lead_time=None, default=None):
3: "https://storage.googleapis.com/national-water-model/",
4: "https://storage.cloud.google.com/national-water-model/",
5: "gs://national-water-model/",
6: "https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/model_output/",
7: "s3://noaa-nwm-retrospective-2-1-pds/model_output/",
6: "gcs://national-water-model/",
7: "https://noaa-nwm-pds.s3.amazonaws.com/",
}


@@ -164,7 +166,7 @@ def create_file_list(
urlbaseinput=None,
lead_time=None, # TODO: change this order; placed here to avoid breaking change
):
""" for given date, run, var, fcst_cycle, and geography, print file names for the valid time (the range of fcst_hours) and dates"""
"""for given date, run, var, fcst_cycle, and geography, print file names for the valid time (the range of fcst_hours) and dates"""

runsuff = ""

@@ -193,7 +195,11 @@
_dtstart = today
_until = today

-    dates = rrule.rrule(rrule.DAILY, dtstart=_dtstart, until=_until,)
+    dates = rrule.rrule(
+        rrule.DAILY,
+        dtstart=_dtstart,
+        until=_until,
+    )
run_t = run_type(runinput, varinput, geoinput, run_name)
fhp = fhprefix(runinput)
vsuff = varsuffix(meminput)
@@ -393,36 +399,40 @@ def create_file_list(

def main():

start_date = "20220822"
end_date = "20220824"
start_date = "20030402"
end_date = "20030420"

fcst_cycle = [12, 18]
lead_time = [1, 2, 240]
fcst_cycle = [5, 12]
lead_time = [158]
# fcst_cycle = None # Retrieves a full day for each day within the range given.

runinput = 2

-    varinput = 1
+    varinput = 3

geoinput = 1

-    meminput = 1

-    urlbaseinput = None

-    print(
-        create_file_list(
-            runinput,
-            varinput,
-            geoinput,
-            meminput,
-            start_date,
-            end_date,
-            fcst_cycle,
-            urlbaseinput,
-            lead_time,
-        )
-    )
+    meminput = 5

+    urlbaseinput = 3

+    file_list = create_file_list(
+        runinput,
+        varinput,
+        geoinput,
+        meminput,
+        start_date,
+        end_date,
+        fcst_cycle,
+        urlbaseinput,
+        lead_time,
+    )
+    if len(file_list) == 0:
+        print(f"No files found")
+    else:
+        print(f"Files: {file_list}\nTotal files: {len(file_list)}")
+    valid_file_list = check_valid_urls(file_list)
+    print(f"Valid Files: {valid_file_list}\nValid files: {len(valid_file_list)}")


if __name__ == "__main__":
152 changes: 80 additions & 72 deletions nwm_filenames/listofnwmfilenamesretro.py
@@ -1,96 +1,104 @@
-import requests
from dateutil import rrule
-from datetime import datetime
from itertools import product
-import multiprocessing
-from multiprocessing.pool import Pool

-retrospective_var_types = [
-    ".CHRTOUT_DOMAIN1.comp",
-    ".GWOUT_DOMAIN1.comp",
-    ".LAKEOUT_DOMAIN1.comp",
-    ".LDASOUT_DOMAIN1.comp",
-    ".RTOUT_DOMAIN1.comp",
-    ".LDASIN_DOMAIN1.comp",
-]
-objecttype = ["forcing/", "model_output/"]

-def generate_url(date, file_type, urlbase_prefix, data_type):
-    year_txt = f"{date.strftime('%Y')}"
-    date_txt = f"{date.strftime('%Y%m%d%H')}"
-    urlbase_prefix = urlbase_prefix + objecttype[file_type - 1]

-    if data_type == 6:
-        url = f"{urlbase_prefix}{year_txt}/{date_txt}00.LDASIN_DOMAIN1"
-    else:
-        url = f"{urlbase_prefix}{year_txt}/{date_txt}00{retrospective_var_types[data_type - 1]}"
-    return url

+from datetime import datetime, timedelta

-def validate_date_range(start_date, end_date):
-    _dtstart = datetime.strptime(start_date, "%Y%m%d%H%M")
-    _until = datetime.strptime(end_date, "%Y%m%d%H%M")
-    return _dtstart, _until

-def retrospective_archive_file_name_creator(start_date, end_date, objecttype, file_types, urlbase_prefix):
-    _dtstart, _until = validate_date_range(start_date, end_date)
-    dates = rrule.rrule(
-        rrule.HOURLY,
-        dtstart=_dtstart,
-        until=_until,
-    )
+from filename_helpers import check_valid_urls

-    r = []
-    datetimes = product(dates, range(1))
-    for _dt, th in datetimes:
-        for tp in file_types:
-            for obj_type in objecttype:
-                file_name = generate_url(_dt, obj_type, urlbase_prefix, tp)
-                if file_name is not None:
-                    r.append(file_name)
+retrospective_var_types = {
+    1: ".CHRTOUT_DOMAIN1.comp",
+    2: ".GWOUT_DOMAIN1.comp",
+    3: ".LAKEOUT_DOMAIN1.comp",
+    4: ".LDASOUT_DOMAIN1.comp",
+    5: ".RTOUT_DOMAIN1.comp",
+    6: ".LDASIN_DOMAIN1.comp",
+}

-    return r, len(r)
+objecttypes = {1: "forcing/", 2: "model_output/"}

+urlbasedict = {
+    6: "https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/",
+    7: "s3://noaa-nwm-retrospective-2-1-pds/model_output/",
+}

-def create_file_list_retro(start_date=None, end_date=None, fcst_cycle=None, urlbaseinput=None, file_types=[1], objecttype=None):

+def generate_url(date, file_type, urlbase_prefix, retrospective_var_types=None):
+    year_txt = date.strftime("%Y")
+    date_txt = date.strftime("%Y%m%d%H")

+    if "forcing" in file_type and date.year < 2007:
+        url = f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}00.LDASIN_DOMAIN1"
+    elif "forcing" in file_type and date.year >= 2007:
+        url = f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}.LDASIN_DOMAIN1"
+    elif "model_output" in file_type:
+        url = [
+            f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}00{type}"
+            for type in retrospective_var_types
+        ]

+    return url


+def create_file_list_retro(
+    start_date=None,
+    end_date=None,
+    urlbaseinput=None,
+    objecttype=objecttypes,
+    selected_var_types=None,
+):
+    urlbase_prefix = urlbasedict[urlbaseinput]
+    objecttype = [objecttypes[i] for i in objecttype]
+    retrospective_var_types_selected = [
+        retrospective_var_types[i] for i in selected_var_types
+    ]

+    start_dt = datetime.strptime(start_date, "%Y%m%d%H%M")
+    end_dt = datetime.strptime(end_date, "%Y%m%d%H%M")

-    if urlbaseinput == 6:
-        return retrospective_archive_file_name_creator(start_date, end_date, objecttype, file_types, urlbase_prefix)
+    delta = end_dt - start_dt
+    date_range = [
+        start_dt + timedelta(hours=i)
+        for i in range(delta.days * 24 + delta.seconds // 3600 + 1)
+    ]

-def check_url(file):
-    try:
-        response = requests.head(file, timeout=1)
-        if response.status_code == 200:
-            return file
-    except requests.exceptions.RequestException:
-        pass
+    file_list = []
+    for date in date_range:
+        for obj_type in objecttype:
+            file_names = generate_url(
+                date, obj_type, urlbase_prefix, retrospective_var_types_selected
+            )
+            if file_names is not None:
+                if isinstance(file_names, list):
+                    file_list.extend(file_names)
+                else:
+                    file_list.append(file_names)

+    return file_list

-def check_valid_urls(file_list):
-    with Pool(multiprocessing.cpu_count()) as p:
-        valid_file_list = p.map(check_url, file_list)
-    return [file for file in valid_file_list if file is not None]

def main():
start_date = "19790201"
end_date = "19790202"
fcst_cycle = [12, 18]
start_date = "20070101"
end_date = "20070102"
urlbaseinput = 6
file_types = [5, 6]
objecttype = [1]
selected_var_types = [1, 2]
selected_object_types = [1] # To test both forcing and model_output
start_time = "0000"
end_time = "0800"
-    file_list, length = create_file_list_retro(start_date + start_time, end_date + end_time, fcst_cycle, urlbaseinput, file_types, objecttype)
-    if length == 0:

+    file_list = create_file_list_retro(
+        start_date + start_time,
+        end_date + end_time,
+        urlbaseinput,
+        selected_object_types,
+        selected_var_types,
+    )

+    if len(file_list) == 0:
print(f"No files found")
else:
print(f"Files: {file_list}\nTotal files: {len(file_list)}")
print(f"Files: {file_list}\nTotal Files: {len(file_list)}")

valid_file_list = check_valid_urls(file_list)

print(f"Valid Files: {valid_file_list}\nValid files: {len(valid_file_list)}")


if __name__ == "__main__":
    main()
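One behavior of the rewritten generate_url worth noting: retrospective forcing files dated before 2007 carry an extra "00" in the timestamp, while later ones do not. A quick illustration, assuming the script is importable as a module (the dates and prefix below are chosen only for demonstration):

from datetime import datetime

from listofnwmfilenamesretro import generate_url

prefix = "https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/"

# Before 2007 the forcing timestamp gains a trailing "00" ...
print(generate_url(datetime(2006, 12, 31, 23), "forcing/", prefix))
# .../forcing/2006/200612312300.LDASIN_DOMAIN1

# ... from 2007 onward it does not.
print(generate_url(datetime(2007, 1, 1, 5), "forcing/", prefix))
# .../forcing/2007/2007010105.LDASIN_DOMAIN1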
