-
Notifications
You must be signed in to change notification settings - Fork 60
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Modified the script to make it more efficient and added a few conditions. (#62)

* validate_url is set to false and the timeout is set to 4 seconds, which makes the output display much faster
* black
* The retrofiles script has been cleaned up; it now has functions, variables, and URLs corresponding to the files needed
* Modified the script to run more efficiently

---------

Co-authored-by: James Halgren <[email protected]>
- Loading branch information
1 parent
6234a55
commit 1f9a6f5
Showing
3 changed files
with
138 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from concurrent.futures import ThreadPoolExecutor | ||
import requests | ||
from functools import partial | ||
|
||
|
||
def check_valid_urls(file_list, session=None):
    """Filter *file_list* down to the URLs that respond successfully.

    A shared requests.Session is created when none is supplied so the
    worker threads can reuse HTTP connections. Each URL is probed via
    check_url on a small thread pool; unreachable URLs come back as
    None and are dropped from the returned list.
    """
    if not session:
        session = requests.Session()
    probe = partial(check_url, session)
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(executor.map(probe, file_list))
    return [url for url in results if url is not None]
def check_url(session, file):
    """Return *file* if the URL answers with a success status, else None.

    Bug fixes: the original called requests.get directly, so the
    *session* argument (bound by check_valid_urls for connection reuse)
    was never used; it also had an unreachable leftover line after the
    return statement. stream=True avoids downloading the body just to
    probe reachability.
    """
    try:
        with session.get(file, stream=True, timeout=1) as response:
            response.raise_for_status()
            return file
    except requests.exceptions.RequestException:
        # Unreachable/erroring URLs are reported as None, not raised.
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,96 +1,104 @@ | ||
import requests | ||
from dateutil import rrule | ||
from datetime import datetime | ||
from itertools import product | ||
import multiprocessing | ||
from multiprocessing.pool import Pool | ||
|
||
# File-name suffixes for each retrospective output variable type.
# Callers index this 1-based (data_type - 1).
retrospective_var_types = [
    ".CHRTOUT_DOMAIN1.comp",
    ".GWOUT_DOMAIN1.comp",
    ".LAKEOUT_DOMAIN1.comp",
    ".LDASOUT_DOMAIN1.comp",
    ".RTOUT_DOMAIN1.comp",
    ".LDASIN_DOMAIN1.comp",
]

# Top-level object prefixes within the retrospective bucket.
objecttype = ["forcing/", "model_output/"]
|
||
def generate_url(date, file_type, urlbase_prefix, data_type):
    """Compose the URL for one hourly retrospective file.

    file_type is a 1-based index into the module-level ``objecttype``
    list; data_type is a 1-based index into ``retrospective_var_types``
    (6 selects the LDASIN forcing file, which has no ``00`` suffix pair).
    """
    year_txt = date.strftime("%Y")
    date_txt = date.strftime("%Y%m%d%H")
    prefix = urlbase_prefix + objecttype[file_type - 1]

    if data_type == 6:
        return f"{prefix}{year_txt}/{date_txt}00.LDASIN_DOMAIN1"
    return f"{prefix}{year_txt}/{date_txt}00{retrospective_var_types[data_type - 1]}"
|
||
from datetime import datetime, timedelta | ||
|
||
def validate_date_range(start_date, end_date):
    """Parse two 'YYYYMMDDHHMM' strings into a (start, end) datetime pair.

    Raises ValueError (from strptime) when either string does not match
    the expected format.
    """
    fmt = "%Y%m%d%H%M"
    return datetime.strptime(start_date, fmt), datetime.strptime(end_date, fmt)
|
||
def retrospective_archive_file_name_creator(
    start_date, end_date, objecttype, file_types, urlbase_prefix
):
    """Enumerate hourly retrospective-archive URLs between two dates.

    start_date/end_date are 'YYYYMMDDHHMM' strings (parsed by
    validate_date_range); objecttype is a list of object-type indices,
    file_types a list of variable-type indices. Returns (url_list, count).

    Fixes: the original wrapped the hourly dates in a pointless
    ``product(dates, range(1))`` whose second element was never used,
    and carried a stray unused in-function import; both are removed.
    """
    _dtstart, _until = validate_date_range(start_date, end_date)
    # One timestamp per hour, inclusive of both endpoints.
    hourly_dates = rrule.rrule(rrule.HOURLY, dtstart=_dtstart, until=_until)

    urls = []
    for _dt in hourly_dates:
        for tp in file_types:
            for obj_type in objecttype:
                file_name = generate_url(_dt, obj_type, urlbase_prefix, tp)
                if file_name is not None:
                    urls.append(file_name)
    return urls, len(urls)
# 1-based selectors for the two object-type prefixes in the bucket.
objecttypes = {1: "forcing/", 2: "model_output/"}

# Supported retrospective data sources, keyed by urlbaseinput code.
urlbasedict = {
    6: "https://noaa-nwm-retrospective-2-1-pds.s3.amazonaws.com/",
    7: "s3://noaa-nwm-retrospective-2-1-pds/model_output/",
}
||
def create_file_list_retro(start_date=None, end_date=None, fcst_cycle=None, urlbaseinput=None, file_types=[1], objecttype=None): | ||
|
||
def generate_url(date, file_type, urlbase_prefix, retrospective_var_types=None):
    """Build the download URL(s) for one date / object-type combination.

    Returns a single URL string for forcing files (pre-2007 names carry a
    trailing "00" on the timestamp), a list of URLs for model_output files
    (one per selected variable suffix), or None when file_type matches
    neither prefix.

    Fixes: the original left ``url`` unbound (UnboundLocalError) for an
    unrecognized file_type even though the caller explicitly checks for
    None; it also crashed on ``retrospective_var_types=None`` for
    model_output and shadowed the builtin ``type`` in the comprehension.
    """
    year_txt = date.strftime("%Y")
    date_txt = date.strftime("%Y%m%d%H")

    url = None  # fall through to None for unknown file_type; caller checks this
    if "forcing" in file_type and date.year < 2007:
        url = f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}00.LDASIN_DOMAIN1"
    elif "forcing" in file_type:
        url = f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}.LDASIN_DOMAIN1"
    elif "model_output" in file_type:
        url = [
            f"{urlbase_prefix}{file_type}{year_txt}/{date_txt}00{suffix}"
            for suffix in (retrospective_var_types or [])
        ]

    return url
|
||
|
||
def create_file_list_retro(
    start_date=None,
    end_date=None,
    urlbaseinput=None,
    objecttype=objecttypes,
    selected_var_types=None,
):
    """Build the flat list of retrospective URLs for the requested window.

    start_date/end_date: 'YYYYMMDDHHMM' strings (inclusive bounds).
    urlbaseinput: key into urlbasedict selecting the data source.
    objecttype: iterable of 1-based keys into objecttypes (defaults to
        all; the dict default is safe because it is only read).
    selected_var_types: iterable of 1-based keys into
        retrospective_var_types.

    Fixes: removed interleaved dead code left over from an earlier
    version (an old ``urlbaseinput == 6`` early-return branch and a
    nested check_url definition) and stopped rebinding the *objecttype*
    parameter to a different meaning mid-function.
    """
    urlbase_prefix = urlbasedict[urlbaseinput]
    selected_prefixes = [objecttypes[i] for i in objecttype]
    retrospective_var_types_selected = [
        retrospective_var_types[i] for i in selected_var_types
    ]

    start_dt = datetime.strptime(start_date, "%Y%m%d%H%M")
    end_dt = datetime.strptime(end_date, "%Y%m%d%H%M")

    # Inclusive hourly range between the two timestamps.
    delta = end_dt - start_dt
    total_hours = delta.days * 24 + delta.seconds // 3600
    date_range = [start_dt + timedelta(hours=i) for i in range(total_hours + 1)]

    file_list = []
    for date in date_range:
        for obj_prefix in selected_prefixes:
            file_names = generate_url(
                date, obj_prefix, urlbase_prefix, retrospective_var_types_selected
            )
            if file_names is None:
                continue
            # model_output yields a list of URLs, forcing a single string.
            if isinstance(file_names, list):
                file_list.extend(file_names)
            else:
                file_list.append(file_names)

    return file_list
|
||
def check_valid_urls(file_list):
    """Probe every URL in parallel and keep only the reachable ones.

    Fans check_url out over a process pool sized to the machine's CPU
    count; unreachable URLs come back as None and are filtered out.
    """
    with Pool(multiprocessing.cpu_count()) as pool:
        results = pool.map(check_url, file_list)
    return [url for url in results if url is not None]
|
||
def main():
    """Demo driver: build the retrospective file list and report which URLs resolve.

    Fixes: removed interleaved leftovers from the previous script version
    (old date constants, fcst_cycle/file_types arguments, and a duplicate
    create_file_list_retro call) and dropped the pointless f-prefix on the
    placeholder-free "No files found" message.
    """
    start_date = "20070101"
    end_date = "20070102"
    start_time = "0000"
    end_time = "0800"
    urlbaseinput = 6
    selected_var_types = [1, 2]  # CHRTOUT + GWOUT suffixes
    selected_object_types = [1]  # forcing/ only — TODO confirm intent vs. old comment

    file_list = create_file_list_retro(
        start_date + start_time,
        end_date + end_time,
        urlbaseinput,
        selected_object_types,
        selected_var_types,
    )

    if len(file_list) == 0:
        print("No files found")
    else:
        print(f"Files: {file_list}\nTotal Files: {len(file_list)}")
        valid_file_list = check_valid_urls(file_list)
        print(f"Valid Files: {valid_file_list}\nValid files: {len(valid_file_list)}")
||
# Script entry point. Fix: the previous text invoked main() twice
# (duplicated line left over from the diff), running the whole probe twice.
if __name__ == "__main__":
    main()