Skip to content

Commit

Permalink
Merge pull request #35559 from openedx/timmc/course-asset-view-cleanup
Browse files Browse the repository at this point in the history
refactor: Clean up after conversion of contentserver to view
timmc-edx authored Sep 30, 2024
2 parents 06a5560 + 7b1519f commit a100166
Showing 2 changed files with 253 additions and 270 deletions.
21 changes: 10 additions & 11 deletions openedx/core/djangoapps/contentserver/test/test_contentserver.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Tests for StaticContentServer
Tests for content server.
"""


@@ -27,7 +27,7 @@
from xmodule.modulestore.tests.django_utils import TEST_DATA_SPLIT_MODULESTORE, SharedModuleStoreTestCase
from xmodule.modulestore.xml_importer import import_course_from_xml

from ..views import HTTP_DATE_FORMAT, StaticContentServer, parse_range_header
from .. import views

log = logging.getLogger(__name__)

@@ -246,7 +246,6 @@ def test_range_request_multiple_ranges(self):
"""
first_byte = self.length_unlocked / 4
last_byte = self.length_unlocked / 2
# lint-amnesty, pylint: disable=bad-option-value, unicode-format-string
resp = self.client.get(self.url_unlocked, HTTP_RANGE='bytes={first}-{last}, -100'.format(
first=first_byte, last=last_byte))

@@ -356,8 +355,8 @@ def test_cache_headers_without_ttl_locked(self, mock_get_cache_ttl):
assert 'private, no-cache, no-store' == resp['Cache-Control']

def test_get_expiration_value(self):
start_dt = datetime.datetime.strptime("Thu, 01 Dec 1983 20:00:00 GMT", HTTP_DATE_FORMAT)
near_expire_dt = StaticContentServer.get_expiration_value(start_dt, 55)
start_dt = datetime.datetime.strptime("Thu, 01 Dec 1983 20:00:00 GMT", views.HTTP_DATE_FORMAT)
near_expire_dt = views.get_expiration_value(start_dt, 55)
assert 'Thu, 01 Dec 1983 20:00:55 GMT' == near_expire_dt

@patch('openedx.core.djangoapps.contentserver.models.CdnUserAgentsConfig.get_cdn_user_agents')
@@ -371,7 +370,7 @@ def test_cache_is_cdn_with_normal_request(self, mock_get_cdn_user_agents):
request_factory = RequestFactory()
browser_request = request_factory.get('/fake', HTTP_USER_AGENT='Chrome 1234')

is_from_cdn = StaticContentServer.is_cdn_request(browser_request)
is_from_cdn = views.is_cdn_request(browser_request)
assert is_from_cdn is False

@patch('openedx.core.djangoapps.contentserver.models.CdnUserAgentsConfig.get_cdn_user_agents')
@@ -385,7 +384,7 @@ def test_cache_is_cdn_with_cdn_request(self, mock_get_cdn_user_agents):
request_factory = RequestFactory()
browser_request = request_factory.get('/fake', HTTP_USER_AGENT='Amazon CloudFront')

is_from_cdn = StaticContentServer.is_cdn_request(browser_request)
is_from_cdn = views.is_cdn_request(browser_request)
assert is_from_cdn is True

@patch('openedx.core.djangoapps.contentserver.models.CdnUserAgentsConfig.get_cdn_user_agents')
@@ -400,7 +399,7 @@ def test_cache_is_cdn_with_cdn_request_multiple_user_agents(self, mock_get_cdn_u
request_factory = RequestFactory()
browser_request = request_factory.get('/fake', HTTP_USER_AGENT='Amazon CloudFront')

is_from_cdn = StaticContentServer.is_cdn_request(browser_request)
is_from_cdn = views.is_cdn_request(browser_request)
assert is_from_cdn is True


@@ -415,7 +414,7 @@ def setUp(self):
self.content_length = 10000

def test_bytes_unit(self):
unit, __ = parse_range_header('bytes=100-', self.content_length)
unit, __ = views.parse_range_header('bytes=100-', self.content_length)
assert unit == 'bytes'

@ddt.data(
@@ -428,7 +427,7 @@ def test_bytes_unit(self):
)
@ddt.unpack
def test_valid_syntax(self, header_value, excepted_ranges_length, expected_ranges):
__, ranges = parse_range_header(header_value, self.content_length)
__, ranges = views.parse_range_header(header_value, self.content_length)
assert len(ranges) == excepted_ranges_length
assert ranges == expected_ranges

@@ -446,5 +445,5 @@ def test_valid_syntax(self, header_value, excepted_ranges_length, expected_range
@ddt.unpack
def test_invalid_syntax(self, header_value, exception_class, exception_message_regex):
self.assertRaisesRegex(
exception_class, exception_message_regex, parse_range_header, header_value, self.content_length
exception_class, exception_message_regex, views.parse_range_header, header_value, self.content_length
)
502 changes: 243 additions & 259 deletions openedx/core/djangoapps/contentserver/views.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,9 @@
"""
Views for serving course assets.
Historically, this was implemented as a *middleware* (StaticContentServer) that
intercepted requests with paths matching certain patterns, rather than using
urlpatterns and a view. There wasn't any good reason for this, as far as I can
tell. It causes some problems for telemetry: When the code-owner middleware asks
Django what view handled the request, it does so by looking at the result of the
`resolve` utility, but these URLs get a Resolver404 (because there's no
registered urlpattern).
We've turned it into a proper view, with a few warts remaining:
- The view implementation is all bundled into a StaticContentServer class that
doesn't appear to have any state. The methods could likely just be extracted
as top-level functions.
- All three urlpatterns are registered to the same view, which then has to
re-parse the URL to determine which pattern is in effect. We should probably
have 3 views as entry points.
For historical reasons, this is just one view that matches to three different URL patterns, and then has to
re-parse the URL to determine which pattern is in effect. We should probably
have 3 views as entry points.
"""
import datetime
import logging
@@ -51,7 +38,7 @@ def course_assets_view(request):
"""
Serve course assets to end users. Colloquially referred to as "contentserver."
"""
return IMPL.process_request(request)
return process_request(request)


log = logging.getLogger(__name__)
@@ -62,264 +49,261 @@ def course_assets_view(request):
HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S GMT"


class StaticContentServer():
"""
Serves course assets to end users. Colloquially referred to as "contentserver."
"""
def is_asset_request(self, request):
"""Determines whether the given request is an asset request"""
# Don't change this without updating urls.py! See docstring of views.py.
return (
request.path.startswith('/' + XASSET_LOCATION_TAG + '/')
or
request.path.startswith('/' + AssetLocator.CANONICAL_NAMESPACE)
or
StaticContent.is_versioned_asset_path(request.path)
)

# pylint: disable=too-many-statements
def process_request(self, request):
"""Process the given request"""
asset_path = request.path

if self.is_asset_request(request): # lint-amnesty, pylint: disable=too-many-nested-blocks
# Make sure we can convert this request into a location.
if AssetLocator.CANONICAL_NAMESPACE in asset_path:
asset_path = asset_path.replace('block/', 'block@', 1)

# If this is a versioned request, pull out the digest and chop off the prefix.
requested_digest = None
if StaticContent.is_versioned_asset_path(asset_path):
requested_digest, asset_path = StaticContent.parse_versioned_asset_path(asset_path)

# Make sure we have a valid location value for this asset.
try:
loc = StaticContent.get_location_from_path(asset_path)
except (InvalidLocationError, InvalidKeyError):
return HttpResponseBadRequest()

# Attempt to load the asset to make sure it exists, and grab the asset digest
# if we're able to load it.
actual_digest = None
def is_asset_request(request):
"""Determines whether the given request is an asset request"""
# Don't change this without updating urls.py! See docstring of views.py.
return (
request.path.startswith('/' + XASSET_LOCATION_TAG + '/')
or
request.path.startswith('/' + AssetLocator.CANONICAL_NAMESPACE)
or
StaticContent.is_versioned_asset_path(request.path)
)


# pylint: disable=too-many-statements
def process_request(request):
"""Process the given request"""
asset_path = request.path

if is_asset_request(request):
# Make sure we can convert this request into a location.
if AssetLocator.CANONICAL_NAMESPACE in asset_path:
asset_path = asset_path.replace('block/', 'block@', 1)

# If this is a versioned request, pull out the digest and chop off the prefix.
requested_digest = None
if StaticContent.is_versioned_asset_path(asset_path):
requested_digest, asset_path = StaticContent.parse_versioned_asset_path(asset_path)

# Make sure we have a valid location value for this asset.
try:
loc = StaticContent.get_location_from_path(asset_path)
except (InvalidLocationError, InvalidKeyError):
return HttpResponseBadRequest()

# Attempt to load the asset to make sure it exists, and grab the asset digest
# if we're able to load it.
actual_digest = None
try:
content = load_asset_from_location(loc)
actual_digest = getattr(content, "content_digest", None)
except (ItemNotFoundError, NotFoundError):
return HttpResponseNotFound()

# If this was a versioned asset, and the digest doesn't match, redirect
# them to the actual version.
if requested_digest is not None and actual_digest is not None and (actual_digest != requested_digest):
actual_asset_path = StaticContent.add_version_to_asset_path(asset_path, actual_digest)
return HttpResponsePermanentRedirect(actual_asset_path)

# Set the basics for this request. Make sure that the course key for this
# asset has a run, which old-style courses do not. Otherwise, this will
# explode when the key is serialized to be sent to NR.
safe_course_key = loc.course_key
if safe_course_key.run is None:
safe_course_key = safe_course_key.replace(run='only')

set_custom_attribute('course_id', safe_course_key)
set_custom_attribute('org', loc.org)
set_custom_attribute('contentserver.path', loc.path)

# Figure out if this is a CDN using us as the origin.
is_from_cdn = is_cdn_request(request)
set_custom_attribute('contentserver.from_cdn', is_from_cdn)

# Check if this content is locked or not.
locked = is_content_locked(content)
set_custom_attribute('contentserver.locked', locked)

# Check that user has access to the content.
if not is_user_authorized(request, content, loc):
return HttpResponseForbidden('Unauthorized')

# Figure out if the client sent us a conditional request, and let them know
# if this asset has changed since then.
last_modified_at_str = content.last_modified_at.strftime(HTTP_DATE_FORMAT)
if 'HTTP_IF_MODIFIED_SINCE' in request.META:
if_modified_since = request.META['HTTP_IF_MODIFIED_SINCE']
if if_modified_since == last_modified_at_str:
return HttpResponseNotModified()

# *** File streaming within a byte range ***
# If a Range is provided, parse Range attribute of the request
# Add Content-Range in the response if Range is structurally correct
# Request -> Range attribute structure: "Range: bytes=first-[last]"
# Response -> Content-Range attribute structure: "Content-Range: bytes first-last/totalLength"
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
response = None
if request.META.get('HTTP_RANGE'):
# If we have a StaticContent, get a StaticContentStream. Can't manipulate the bytes otherwise.
if isinstance(content, StaticContent):
content = AssetManager.find(loc, as_stream=True)

header_value = request.META['HTTP_RANGE']
try:
content = self.load_asset_from_location(loc)
actual_digest = getattr(content, "content_digest", None)
except (ItemNotFoundError, NotFoundError):
return HttpResponseNotFound()

# If this was a versioned asset, and the digest doesn't match, redirect
# them to the actual version.
if requested_digest is not None and actual_digest is not None and (actual_digest != requested_digest):
actual_asset_path = StaticContent.add_version_to_asset_path(asset_path, actual_digest)
return HttpResponsePermanentRedirect(actual_asset_path)

# Set the basics for this request. Make sure that the course key for this
# asset has a run, which old-style courses do not. Otherwise, this will
# explode when the key is serialized to be sent to NR.
safe_course_key = loc.course_key
if safe_course_key.run is None:
safe_course_key = safe_course_key.replace(run='only')

set_custom_attribute('course_id', safe_course_key)
set_custom_attribute('org', loc.org)
set_custom_attribute('contentserver.path', loc.path)

# Figure out if this is a CDN using us as the origin.
is_from_cdn = StaticContentServer.is_cdn_request(request)
set_custom_attribute('contentserver.from_cdn', is_from_cdn)

# Check if this content is locked or not.
locked = self.is_content_locked(content)
set_custom_attribute('contentserver.locked', locked)

# Check that user has access to the content.
if not self.is_user_authorized(request, content, loc):
return HttpResponseForbidden('Unauthorized')

# Figure out if the client sent us a conditional request, and let them know
# if this asset has changed since then.
last_modified_at_str = content.last_modified_at.strftime(HTTP_DATE_FORMAT)
if 'HTTP_IF_MODIFIED_SINCE' in request.META:
if_modified_since = request.META['HTTP_IF_MODIFIED_SINCE']
if if_modified_since == last_modified_at_str:
return HttpResponseNotModified()

# *** File streaming within a byte range ***
# If a Range is provided, parse Range attribute of the request
# Add Content-Range in the response if Range is structurally correct
# Request -> Range attribute structure: "Range: bytes=first-[last]"
# Response -> Content-Range attribute structure: "Content-Range: bytes first-last/totalLength"
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
response = None
if request.META.get('HTTP_RANGE'):
# If we have a StaticContent, get a StaticContentStream. Can't manipulate the bytes otherwise.
if isinstance(content, StaticContent):
content = AssetManager.find(loc, as_stream=True)

header_value = request.META['HTTP_RANGE']
try:
unit, ranges = parse_range_header(header_value, content.length)
except ValueError as exception:
# If the header field is syntactically invalid it should be ignored.
log.exception(
"%s in Range header: %s for content: %s",
str(exception), header_value, str(loc)
unit, ranges = parse_range_header(header_value, content.length)
except ValueError as exception:
# If the header field is syntactically invalid it should be ignored.
log.exception(
"%s in Range header: %s for content: %s",
str(exception), header_value, str(loc)
)
else:
if unit != 'bytes':
# Only accept ranges in bytes
log.warning("Unknown unit in Range header: %s for content: %s", header_value, str(loc))
elif len(ranges) > 1:
# According to Http/1.1 spec content for multiple ranges should be sent as a multipart message.
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.16
# But we send back the full content.
log.warning(
"More than 1 ranges in Range header: %s for content: %s", header_value, str(loc)
)
else:
if unit != 'bytes':
# Only accept ranges in bytes
log.warning("Unknown unit in Range header: %s for content: %s", header_value, str(loc))
elif len(ranges) > 1:
# According to Http/1.1 spec content for multiple ranges should be sent as a multipart message.
# http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.16
# But we send back the full content.
log.warning(
"More than 1 ranges in Range header: %s for content: %s", header_value, str(loc)
first, last = ranges[0]

if 0 <= first <= last < content.length:
# If the byte range is satisfiable
response = HttpResponse(content.stream_data_in_range(first, last))
response['Content-Range'] = 'bytes {first}-{last}/{length}'.format(
first=first, last=last, length=content.length
)
response['Content-Length'] = str(last - first + 1)
response.status_code = 206 # Partial Content

set_custom_attribute('contentserver.ranged', True)
else:
first, last = ranges[0]

if 0 <= first <= last < content.length:
# If the byte range is satisfiable
response = HttpResponse(content.stream_data_in_range(first, last))
response['Content-Range'] = 'bytes {first}-{last}/{length}'.format(
first=first, last=last, length=content.length
)
response['Content-Length'] = str(last - first + 1)
response.status_code = 206 # Partial Content

set_custom_attribute('contentserver.ranged', True)
else:
log.warning(
"Cannot satisfy ranges in Range header: %s for content: %s",
header_value, str(loc)
)
return HttpResponse(status=416) # Requested Range Not Satisfiable

# If Range header is absent or syntactically invalid return a full content response.
if response is None:
response = HttpResponse(content.stream_data())
response['Content-Length'] = content.length

set_custom_attribute('contentserver.content_len', content.length)
set_custom_attribute('contentserver.content_type', content.content_type)

# "Accept-Ranges: bytes" tells the user that only "bytes" ranges are allowed
response['Accept-Ranges'] = 'bytes'
response['Content-Type'] = content.content_type
response['X-Frame-Options'] = 'ALLOW'

# Set any caching headers, and do any response cleanup needed. Based on how much
# middleware we have in place, there's no easy way to use the built-in Django
# utilities and properly sanitize and modify a response to ensure that it is as
# cacheable as possible, which is why we do it ourselves.
self.set_caching_headers(content, response)

return response

def set_caching_headers(self, content, response):
"""
Sets caching headers based on whether or not the asset is locked.
"""

is_locked = getattr(content, "locked", False)

# We want to signal to the end user's browser, and to any intermediate proxies/caches,
# whether or not this asset is cacheable. If we have a TTL configured, we inform the
# caller, for unlocked assets, how long they are allowed to cache it. Since locked
# assets should be restricted to enrolled students, we simply send headers that
# indicate there should be no caching whatsoever.
cache_ttl = CourseAssetCacheTtlConfig.get_cache_ttl()
if cache_ttl > 0 and not is_locked:
set_custom_attribute('contentserver.cacheable', True)

response['Expires'] = StaticContentServer.get_expiration_value(datetime.datetime.utcnow(), cache_ttl)
response['Cache-Control'] = "public, max-age={ttl}, s-maxage={ttl}".format(ttl=cache_ttl)
elif is_locked:
set_custom_attribute('contentserver.cacheable', False)

response['Cache-Control'] = "private, no-cache, no-store"

response['Last-Modified'] = content.last_modified_at.strftime(HTTP_DATE_FORMAT)

# Force the Vary header to only vary responses on Origin, so that XHR and browser requests get cached
# separately and don't screw over one another. i.e. a browser request that doesn't send Origin, and
# caches a version of the response without CORS headers, in turn breaking XHR requests.
force_header_for_response(response, 'Vary', 'Origin')

@staticmethod
def is_cdn_request(request):
"""
Attempts to determine whether or not the given request is coming from a CDN.
Currently, this is a static check because edx.org only uses CloudFront, but may
be expanded in the future.
"""
cdn_user_agents = CdnUserAgentsConfig.get_cdn_user_agents()
user_agent = request.META.get('HTTP_USER_AGENT', '')
if user_agent in cdn_user_agents:
# This is a CDN request.
return True
log.warning(
"Cannot satisfy ranges in Range header: %s for content: %s",
header_value, str(loc)
)
return HttpResponse(status=416) # Requested Range Not Satisfiable

return False
# If Range header is absent or syntactically invalid return a full content response.
if response is None:
response = HttpResponse(content.stream_data())
response['Content-Length'] = content.length

@staticmethod
def get_expiration_value(now, cache_ttl):
"""Generates an RFC1123 datetime string based on a future offset."""
expire_dt = now + datetime.timedelta(seconds=cache_ttl)
return expire_dt.strftime(HTTP_DATE_FORMAT)

def is_content_locked(self, content):
"""
Determines whether or not the given content is locked.
"""
return bool(getattr(content, "locked", False))

def is_user_authorized(self, request, content, location):
"""
Determines whether or not the user for this request is authorized to view the given asset.
"""
if not self.is_content_locked(content):
return True

if not hasattr(request, "user") or not request.user.is_authenticated:
return False
set_custom_attribute('contentserver.content_len', content.length)
set_custom_attribute('contentserver.content_type', content.content_type)

# "Accept-Ranges: bytes" tells the user that only "bytes" ranges are allowed
response['Accept-Ranges'] = 'bytes'
response['Content-Type'] = content.content_type
response['X-Frame-Options'] = 'ALLOW'

# Set any caching headers, and do any response cleanup needed. Based on how much
# middleware we have in place, there's no easy way to use the built-in Django
# utilities and properly sanitize and modify a response to ensure that it is as
# cacheable as possible, which is why we do it ourselves.
set_caching_headers(content, response)

return response


def set_caching_headers(content, response):
"""
Sets caching headers based on whether or not the asset is locked.
"""

is_locked = getattr(content, "locked", False)

# We want to signal to the end user's browser, and to any intermediate proxies/caches,
# whether or not this asset is cacheable. If we have a TTL configured, we inform the
# caller, for unlocked assets, how long they are allowed to cache it. Since locked
# assets should be restricted to enrolled students, we simply send headers that
# indicate there should be no caching whatsoever.
cache_ttl = CourseAssetCacheTtlConfig.get_cache_ttl()
if cache_ttl > 0 and not is_locked:
set_custom_attribute('contentserver.cacheable', True)

response['Expires'] = get_expiration_value(datetime.datetime.utcnow(), cache_ttl)
response['Cache-Control'] = "public, max-age={ttl}, s-maxage={ttl}".format(ttl=cache_ttl)
elif is_locked:
set_custom_attribute('contentserver.cacheable', False)

response['Cache-Control'] = "private, no-cache, no-store"

response['Last-Modified'] = content.last_modified_at.strftime(HTTP_DATE_FORMAT)

if not request.user.is_staff:
deprecated = getattr(location, 'deprecated', False)
if deprecated and not CourseEnrollment.is_enrolled_by_partial(request.user, location.course_key):
return False
if not deprecated and not CourseEnrollment.is_enrolled(request.user, location.course_key):
return False
# Force the Vary header to only vary responses on Origin, so that XHR and browser requests get cached
# separately and don't screw over one another. i.e. a browser request that doesn't send Origin, and
# caches a version of the response without CORS headers, in turn breaking XHR requests.
force_header_for_response(response, 'Vary', 'Origin')


@staticmethod
def is_cdn_request(request):
"""
Attempts to determine whether or not the given request is coming from a CDN.
Currently, this is a static check because edx.org only uses CloudFront, but may
be expanded in the future.
"""
cdn_user_agents = CdnUserAgentsConfig.get_cdn_user_agents()
user_agent = request.META.get('HTTP_USER_AGENT', '')
if user_agent in cdn_user_agents:
# This is a CDN request.
return True

def load_asset_from_location(self, location):
"""
Loads an asset based on its location, either retrieving it from a cache
or loading it directly from the contentstore.
"""
return False

# See if we can load this item from cache.
content = get_cached_content(location)
if content is None:
# Not in cache, so just try and load it from the asset manager.
try:
content = AssetManager.find(location, as_stream=True)
except (ItemNotFoundError, NotFoundError): # lint-amnesty, pylint: disable=try-except-raise
raise

# Now that we fetched it, let's go ahead and try to cache it. We cap this at 1MB
# because it's the default for memcached and also we don't want to do too much
# buffering in memory when we're serving an actual request.
if content.length is not None and content.length < 1048576:
content = content.copy_to_in_mem()
set_cached_content(content)
@staticmethod
def get_expiration_value(now, cache_ttl):
"""Generates an RFC1123 datetime string based on a future offset."""
expire_dt = now + datetime.timedelta(seconds=cache_ttl)
return expire_dt.strftime(HTTP_DATE_FORMAT)


def is_content_locked(content):
"""
Determines whether or not the given content is locked.
"""
return bool(getattr(content, "locked", False))


def is_user_authorized(request, content, location):
"""
Determines whether or not the user for this request is authorized to view the given asset.
"""
if not is_content_locked(content):
return True

if not hasattr(request, "user") or not request.user.is_authenticated:
return False

if not request.user.is_staff:
deprecated = getattr(location, 'deprecated', False)
if deprecated and not CourseEnrollment.is_enrolled_by_partial(request.user, location.course_key):
return False
if not deprecated and not CourseEnrollment.is_enrolled(request.user, location.course_key):
return False

return True


def load_asset_from_location(location):
"""
Loads an asset based on its location, either retrieving it from a cache
or loading it directly from the contentstore.
"""

return content
# See if we can load this item from cache.
content = get_cached_content(location)
if content is None:
# Not in cache, so just try and load it from the asset manager.
content = AssetManager.find(location, as_stream=True)

# Now that we fetched it, let's go ahead and try to cache it. We cap this at 1MB
# because it's the default for memcached and also we don't want to do too much
# buffering in memory when we're serving an actual request.
if content.length is not None and content.length < 1048576:
content = content.copy_to_in_mem()
set_cached_content(content)

IMPL = StaticContentServer()
return content


def parse_range_header(header_value, content_length):

0 comments on commit a100166

Please sign in to comment.