# Fixed directory structure for S3 bucket #2448

**Status:** Open · 1 commit · base branch: `develop`
## doajtest/unit/test_bll_site_sitemap.py (9 additions, 9 deletions)
```diff
@@ -125,11 +125,11 @@ def test_sitemap(self, name, kwargs):
         articles_expectations = [(a.id, a.last_updated) for a in articles]
 
         if prune:
-            self.localStore.store(self.container_id, "sitemap_doaj_20180101_0000_utf8.xml",
+            self.localStore.store(self.container_id, "sitemap_doaj_20180101_0000/_0_utf8.xml",
                                   source_stream=StringIO("test1"))
-            self.localStore.store(self.container_id, "sitemap_doaj_20180601_0000_utf8.xml",
+            self.localStore.store(self.container_id, "sitemap_doaj_20180601_0000/_0_utf8.xml",
                                   source_stream=StringIO("test2"))
-            self.localStore.store(self.container_id, "sitemap_doaj_20190101_0000_utf8.xml",
+            self.localStore.store(self.container_id, "sitemap_doaj_20190101_0000/_0_utf8.xml",
                                   source_stream=StringIO("test3"))
 
         ###########################################################
@@ -153,22 +153,22 @@ def test_sitemap(self, name, kwargs):
         filenames = self.localStore.list(self.container_id)
         if prune:
             assert len(filenames) == 2, "expected 2, received {}".format(len(filenames))
-            assert "sitemap_doaj_20180101_0000_utf8.xml" not in filenames
-            assert "sitemap_doaj_20180601_0000_utf8.xml" not in filenames
-            assert "sitemap_doaj_20190101_0000_utf8.xml" in filenames
+            assert "sitemap_doaj_20180101_0000" not in filenames
+            assert "sitemap_doaj_20180601_0000" not in filenames
+            assert "sitemap_doaj_20190101_0000" in filenames
         else:
             assert len(filenames) == 1, "expected 1, received {}".format(len(filenames))
 
         latest = None
         for fn in filenames:
-            if fn != "sitemap_doaj_20190101_0000_utf8.xml":
+            if fn != "sitemap_doaj_20190101_0000":
                 latest = fn
                 break
 
         NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"
 
         file_date = '_'.join(latest.split('_')[2:])
-        index_file = os.path.join(latest, 'sitemap_index_doaj_' + file_date + '_utf8.xml')
+        index_file = os.path.join(latest, 'sitemap_index_utf8.xml')
 
         handle = self.localStore.get(self.container_id, index_file, encoding="utf-8")
@@ -184,7 +184,7 @@ def test_sitemap(self, name, kwargs):
         article_ids = []
 
         # check sitemap file
-        sitemap_file = os.path.join(latest, 'sitemap_doaj_' + file_date + '_0_utf8.xml')
+        sitemap_file = os.path.join(latest, '_0_utf8.xml')
         handle = self.localStore.get(self.container_id, sitemap_file, encoding="utf-8")
 
         tree = etree.parse(handle)
```
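Taken together, the fixture and assertion changes encode the new storage layout: one directory per sitemap run, holding the numbered sitemap files and the index, instead of flat date-stamped files. A small sketch of the before/after naming (dates are the test's own fixtures; POSIX separators assumed):

```python
import os

# Before: flat, date-stamped files in the cache container
old_sitemap = "sitemap_doaj_20190101_0000_utf8.xml"

# After: one virtual directory per run
new_sitemap = os.path.join("sitemap_doaj_20190101_0000", "_0_utf8.xml")
new_index = os.path.join("sitemap_doaj_20190101_0000", "sitemap_index_utf8.xml")

# localStore.list(container) now surfaces the run directory itself, which is
# why the pruning assertions check "sitemap_doaj_20190101_0000" with no suffix.
print(new_sitemap)  # sitemap_doaj_20190101_0000/_0_utf8.xml
print(new_index)    # sitemap_doaj_20190101_0000/sitemap_index_utf8.xml
```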
## portality/bll/services/site.py (6 additions, 7 deletions)
```diff
@@ -60,7 +60,7 @@ def write_url_element(self, loc, lastmod=None):
         self.file.write(url_ele)
 
     def create_sitemap_file(self):
-        self.current_filename = f'{self.filename_prefix}_{self.file_idx}_utf8.xml'
+        self.current_filename = os.path.join(self.filename_prefix, f'_{self.file_idx}_utf8.xml')
         self.current_file_path = os.path.join(self.temp_store, self.current_filename)
         self.file = open(self.current_file_path, "w")
         self.file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
```
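With the prefix now acting as a directory, rolling over to a new sitemap file only changes the `_<file_idx>_utf8.xml` leaf name. A quick sketch of the names successive `create_sitemap_file()` calls produce (the prefix value is illustrative):

```python
import os

filename_prefix = "sitemap_doaj_20190101_0000"  # 'sitemap_doaj_' + run_start_time

for file_idx in range(3):
    # mirrors the patched naming: <run directory>/_<idx>_utf8.xml
    print(os.path.join(filename_prefix, f"_{file_idx}_utf8.xml"))
# sitemap_doaj_20190101_0000/_0_utf8.xml
# sitemap_doaj_20190101_0000/_1_utf8.xml
# sitemap_doaj_20190101_0000/_2_utf8.xml
```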
```diff
@@ -107,8 +107,7 @@ def sitemap(self, prune: bool = True):
         lastmod_date = dates.now_str(FMT_DATETIME_STD)
 
         filename_prefix = 'sitemap_doaj_' + run_start_time
-        cache_container_id = app.config.get("STORE_CACHE_CONTAINER")
-        container_id = os.path.join(cache_container_id, filename_prefix)
+        container_id = app.config.get("STORE_CACHE_CONTAINER")
 
         total_static_pages = 0
         total_journals_count = 0
@@ -121,7 +120,7 @@ def sitemap(self, prune: bool = True):
         # temporary directory
         tmp_store_dir = tmpStore.path(container_id, '', create_container=True)
         # Create the directories if they don't exist
-        os.makedirs(tmp_store_dir, exist_ok=True)
+        os.makedirs(os.path.join(tmp_store_dir, filename_prefix), exist_ok=True)
 
         sitemap_generator = SitemapGenerator(filename_prefix, tmp_store_dir, mainStore, container_id)
@@ -157,7 +156,7 @@ def sitemap(self, prune: bool = True):
         sitemap_generator.finalize_sitemap_file()
 
         # Create sitemap index file
-        sitemap_index_filename = f'sitemap_index_doaj_{run_start_time}_utf8.xml'
+        sitemap_index_filename = os.path.join(filename_prefix, 'sitemap_index_utf8.xml')
         sitemap_index_path = os.path.join(tmp_store_dir, sitemap_index_filename)
         with open(sitemap_index_path, "w") as f:
             f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
@@ -205,8 +204,8 @@ def sort(filelist):
         def _filter(filename):
             return filename.startswith("sitemap_")
 
-        action_register += prune_container(mainStore, cache_container_id, sort, filter=_filter, keep=2)
-        action_register += prune_container(tmpStore, cache_container_id, sort, filter=_filter, keep=2)
+        action_register += prune_container(mainStore, container_id, sort, filter=_filter, keep=2, is_directory=True)
+        action_register += prune_container(tmpStore, container_id, sort, filter=_filter, keep=2)
 
         # Update the cache record to point to the new sitemap index and all sitemaps
         models.Cache.cache_sitemap(index_url)
```
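The net effect in `sitemap()` is that the container id stays fixed at the cache container while the per-run directory becomes part of the file names. A rough sketch of the temp-store layout this produces, assuming a temp root of `/tmp/doaj-store` and `STORE_CACHE_CONTAINER = 'cache'` (both values hypothetical, and `tmpStore.path` approximated by a plain join):

```python
import os

tmp_root = "/tmp/doaj-store"                    # hypothetical tmpStore root
container_id = "cache"                          # assumed STORE_CACHE_CONTAINER value
filename_prefix = "sitemap_doaj_20190101_0000"  # 'sitemap_doaj_' + run_start_time

tmp_store_dir = os.path.join(tmp_root, container_id)  # approximates tmpStore.path(container_id, '')
os.makedirs(os.path.join(tmp_store_dir, filename_prefix), exist_ok=True)

index_path = os.path.join(tmp_store_dir, filename_prefix, "sitemap_index_utf8.xml")
print(index_path)
# /tmp/doaj-store/cache/sitemap_doaj_20190101_0000/sitemap_index_utf8.xml
```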
## portality/store.py (41 additions, 10 deletions)
```diff
@@ -76,6 +76,7 @@ class StoreS3(Store):
     ~~!FileStoreS3:Feature->S3:Technology~~
     """
     def __init__(self, scope):
+        self.dir = None
         self._cfg = app.config.get("STORE_S3_SCOPES", {}).get(scope)
         multipart_threshold = app.config.get("STORE_S3_MULTIPART_THRESHOLD", 5 * 1024**3)
@@ -211,9 +212,10 @@ def __init__(self, scope):
     def store(self, container_id, target_name, source_path=None, source_stream=None):
         cpath = os.path.join(self.dir, container_id)
-        if not os.path.exists(cpath):
-            os.makedirs(cpath)
         tpath = os.path.join(cpath, target_name)
+        directory = os.path.dirname(tpath)
+        if not os.path.exists(directory):
+            os.makedirs(directory)
 
         if source_path:
             shutil.copyfile(source_path, tpath)
```
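This is the change that lets callers store nested target names such as `sitemap_doaj_.../_0_utf8.xml` in one call: the parent directory is derived from the full target path rather than from the container alone. A self-contained sketch of the same pattern (the function and paths are illustrative, not part of the codebase):

```python
import os
import shutil

def store_file(store_root, container_id, target_name, source_path):
    # Same pattern as the patched StoreLocal.store: derive the directory to
    # create from the full target path, so nested target names just work.
    tpath = os.path.join(store_root, container_id, target_name)
    directory = os.path.dirname(tpath)
    if not os.path.exists(directory):
        os.makedirs(directory)
    shutil.copyfile(source_path, tpath)
    return tpath

# store_file("/tmp/store", "cache", "sitemap_doaj_20190101_0000/_0_utf8.xml", "src.xml")
# creates /tmp/store/cache/sitemap_doaj_20190101_0000/ on demand, then copies into it
```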
```diff
@@ -292,10 +294,11 @@ def list_container_ids(self):
         return [x for x in os.listdir(self.dir) if os.path.isdir(os.path.join(self.dir, x))]
 
 
-def prune_container(storage, container_id, sort, filter=None, keep=1, logger=None):
+def prune_container(storage, container_id, sort, filter=None, keep=1, logger=None, is_directory=False):
     logger = logger if logger is not None else lambda x: x
     action_register = []
 
+    dir_list = []
     filelist = storage.list(container_id)
     #action_register.append("Current cached files (before prune): " + ", ".join(filelist))
@@ -309,19 +312,47 @@ def prune_container(storage, container_id, sort, filter=None, keep=1, logger=None):
         filtered = filelist
     #action_register.append("Filtered cached files (before prune): " + ", ".join(filelist))
 
-    if len(filtered) <= keep:
-        # action_register.append("Fewer than {x} files in cache, no further action".format(x=keep))
-        return action_register
+    # Treat directories differently: S3 buckets have no physical directories
+    # under the bucket, only virtual ones (key prefixes). Collect the
+    # directories, then delete all files belonging to the pruned directories.
+    if is_directory:
+        for fn in filtered:
+            dir = os.path.dirname(fn)
+            if dir:
+                dir_list.append(dir)
+            else:
+                if storage.dir:
+                    if os.path.isdir(os.path.join(storage.dir, container_id, fn)):
+                        dir_list.append(fn)
+
+        dir_set = set(dir_list)
+
+    if is_directory:
+        if len(dir_set) <= keep:
+            return action_register
+    else:
+        if len(filtered) <= keep:
+            # action_register.append("Fewer than {x} files in cache, no further action".format(x=keep))
+            return action_register
 
-    filtered_sorted = sort(filtered)
-    #action_register.append("Considering files for retention in the following order: " + ", ".join(filtered_sorted))
+    if is_directory:
+        filtered_sorted = sort(dir_set)
+    else:
+        filtered_sorted = sort(filtered)
+    #action_register.append("Considering files for retention in the following order: " + ", ".join(filtered_sorted))
 
     remove = filtered_sorted[keep:]
     msg = "Removed old files: " + ", ".join(remove)
     action_register.append(msg)
     logger(msg)
 
-    for fn in remove:
-        storage.delete_file(container_id, fn)
+    if is_directory:
+        for fn in remove:
+            for file in filtered:
+                if file.startswith(fn):
+                    storage.delete_file(container_id, file)
+    else:
+        for fn in remove:
+            storage.delete_file(container_id, fn)
 
     return action_register
```
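For review, here is a self-contained sketch of what the `is_directory=True` branch does on an S3-style listing, where directories are only key prefixes. `FakeStore` and the date-descending sort are illustrative stand-ins, not the real `sort` callback from `site.py`:

```python
import os

class FakeStore:
    """Minimal stand-in for an S3-backed store (no local root directory)."""
    dir = None

    def __init__(self, files):
        self.files = list(files)

    def list(self, container_id):
        return list(self.files)

    def delete_file(self, container_id, fn):
        self.files.remove(fn)

store = FakeStore([
    "sitemap_doaj_20180101_0000/_0_utf8.xml",
    "sitemap_doaj_20180101_0000/sitemap_index_utf8.xml",
    "sitemap_doaj_20190101_0000/_0_utf8.xml",
    "sitemap_doaj_20190101_0000/sitemap_index_utf8.xml",
])

# Mimic the directory branch: group files by prefix, keep the newest `keep`
# directories, then delete every file under the directories being dropped.
keep = 1
dirs = sorted({os.path.dirname(fn) for fn in store.list("cache")}, reverse=True)
for old_dir in dirs[keep:]:
    for fn in [f for f in store.list("cache") if f.startswith(old_dir)]:
        store.delete_file("cache", fn)

print(store.files)
# ['sitemap_doaj_20190101_0000/_0_utf8.xml',
#  'sitemap_doaj_20190101_0000/sitemap_index_utf8.xml']
```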