Skip to content

Commit

Permalink
Merge branch '2u/course-optimizer' into 2u/optimizer-tests
Browse files Browse the repository at this point in the history
  • Loading branch information
rayzhou-bit authored Jan 13, 2025
2 parents 81619f8 + de1aa1d commit ebee1d9
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 151 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ def generate_broken_links_descriptor(json_content, request_user):

for item in json_content:
block_id, link, *rest = item
is_locked_flag = bool(rest[0])
if rest:
is_locked_flag = bool(rest[0])
else:
is_locked_flag = False

usage_key = usage_key_with_run(block_id)
block = get_xblock(usage_key, request_user)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,5 @@ class LinkCheckSerializer(serializers.Serializer):
""" Serializer for broken links """
LinkCheckStatus = serializers.CharField(required=True)
LinkCheckCreatedAt = serializers.DateTimeField(required=False)
LinkCheckOutput = LinkCheckOutputSerializer(required=True)
LinkCheckOutput = LinkCheckOutputSerializer(required=False)
LinkCheckError = serializers.CharField(required=False)
275 changes: 139 additions & 136 deletions cms/djangoapps/contentstore/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1104,160 +1104,163 @@ def generate_name(cls, arguments_dict):
key = arguments_dict['course_key_string']
return f'Broken link check of {key}'

# -------------- Course optimizer functions ------------------

@shared_task(base=CourseLinkCheckTask, bind=True)
def check_broken_links(self, user_id, course_key_string, language):
def _validate_user(task, user_id, language):
    """
    Look up the user that requested the link check.

    Arguments:
        task: object whose ``status.fail`` is called when the lookup fails.
        user_id: primary key handed to ``User.objects.get``.
        language: passed to ``translation_language`` while the failure message is recorded.

    Returns:
        The matching ``User``, or ``None`` when no user has that primary key.
    """
    try:
        return User.objects.get(pk=user_id)
    except User.DoesNotExist:
        # Record the failure on the task status instead of propagating the exception.
        with translation_language(language):
            task.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id))
        return None

def _get_urls(content):
"""
Checks for broken links in a course. Store the results in a file.
Returns all urls found after href and src in content.
Excludes urls that are only '#'.
"""
def validate_user():
"""Validate if the user exists. Otherwise log error. """
try:
return User.objects.get(pk=user_id)
except User.DoesNotExist as exc:
with translation_language(language):
self.status.fail(UserErrors.UNKNOWN_USER_ID.format(user_id))
return
regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
url_list = re.findall(regex, content)
return url_list

def get_urls(content):
    """
    Collect the value of every href and src attribute found in content.

    A match needs whitespace in front of the attribute name and a quoted value.
    Values that begin with '#' are skipped.

    Returns: list of url strings, in the order they appear in content.
    """
    attribute_pattern = re.compile(r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']')
    return attribute_pattern.findall(content)

def is_studio_url(url):
    """Return True unless url begins with 'http://' or 'https://'; such urls are treated as studio urls."""
    return not url.startswith(('http://', 'https://'))

def convert_to_standard_url(url, course_key):
"""
Returns standard urls when given studio urls. Otherwise return url as is.
Example urls:
/assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
/static/getting-started_x250.png
/container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
"""
if is_studio_url(url):
if url.startswith('/static/'):
processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1]
return 'http://' + settings.CMS_BASE + processed_url
elif url.startswith('/'):
return 'http://' + settings.CMS_BASE + url
else:
return 'http://' + settings.CMS_BASE + '/container/' + url
def _is_studio_url(url):
    """Return True when url carries neither an 'http://' nor an 'https://' prefix."""
    has_web_scheme = url.startswith('http://') or url.startswith('https://')
    return not has_web_scheme

def _convert_to_standard_url(url, course_key):
    """
    Turn a studio url into an absolute url on the CMS host; return any other url unchanged.

    Example studio urls:
        /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
        /static/getting-started_x250.png
        /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7

    Arguments:
        url: url string as found in block content.
        course_key: forwarded to ``replace_static_urls`` as ``course_id`` for '/static/' urls.
    """
    if not _is_studio_url(url):
        return url

    # NOTE(review): the scheme is hard-coded to http:// — confirm this holds for every deployment.
    cms_root = 'http://' + settings.CMS_BASE

    if url.startswith('/static/'):
        # The url is wrapped in double quotes for replace_static_urls; [1:-1] strips them again.
        processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1]
        return cms_root + processed_url

    if url.startswith('/'):
        return cms_root + url

    # Anything without a leading slash is appended to the /container/ path.
    return cms_root + '/container/' + url

def scan_course_for_links(course_key):
"""
Returns a list of all urls in a course.
Returns: [ [block_id1, url1], [block_id2, url2], ... ]
"""
verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only)
blocks = []
urls_to_validate = []
def _scan_course_for_links(course_key):
"""
Returns a list of all urls in a course.
Returns: [ [block_id1, url1], [block_id2, url2], ... ]
"""
verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'},
revision=ModuleStoreEnum.RevisionOption.published_only)
blocks = []
urls_to_validate = []

for vertical in verticals:
blocks.extend(vertical.get_children())
for vertical in verticals:
blocks.extend(vertical.get_children())

for block in blocks:
block_id = str(block.usage_key)
block_info = get_block_info(block)
block_data = block_info['data']
for block in blocks:
block_id = str(block.usage_key)
block_info = get_block_info(block)
block_data = block_info['data']

url_list = get_urls(block_data)
urls_to_validate += [[block_id, url] for url in url_list]
url_list = _get_urls(block_data)
urls_to_validate += [[block_id, url] for url in url_list]

return urls_to_validate
return urls_to_validate

async def validate_url_access(session, url_data, course_key):
    """
    Request one url and report the HTTP status that came back.

    Arguments:
        session: object whose ``get`` returns an async context manager yielding a response
            (an ``aiohttp.ClientSession`` at the visible call site).
        url_data: ``[block_id, url]`` pair.
        course_key: forwarded to ``convert_to_standard_url``.

    Returns:
        dict: ``{'block_id': ..., 'url': ..., 'status': ...}``; ``status`` is the response
        status, or ``None`` when the request raised.
    """
    block_id, url = url_data
    result = {'block_id': block_id, 'url': url}
    standardized_url = convert_to_standard_url(url, course_key)
    # Express the 5 second budget as a ClientTimeout; aiohttp keeps the bare-number form
    # only for backward compatibility.
    request_timeout = aiohttp.ClientTimeout(total=5)
    try:
        async with session.get(standardized_url, timeout=request_timeout) as response:
            result['status'] = response.status
    except Exception as e:  # pylint: disable=broad-except
        # Best effort: any request failure is reported as status None instead of raising.
        result['status'] = None
        LOGGER.debug('[Link Check] Request error when validating %s: %s', url, e)
    return result

async def validate_urls_access_in_batches(url_list, course_key, batch_size=100):
    """
    Request every url in url_list, batch_size urls at a time.

    Each batch gets its own ``aiohttp.ClientSession`` and its requests are awaited
    together with ``asyncio.gather``.

    Arguments:
        url_list: ``[ [block_id1, url1], [block_id2, url2], ... ]``
        course_key: forwarded to ``validate_url_access``.
        batch_size: number of urls requested concurrently per batch.

    Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ]
    """
    responses = []
    batch_starts = range(0, len(url_list), batch_size)
    # len() of the range is the exact batch total; ``url_count // batch_size + 1`` reported
    # one batch too many whenever the url count was an exact multiple of batch_size.
    batch_count = len(batch_starts)

    for batch_number, start in enumerate(batch_starts, start=1):
        batch = url_list[start:start + batch_size]
        async with aiohttp.ClientSession() as session:
            tasks = [validate_url_access(session, url_data, course_key) for url_data in batch]
            batch_results = await asyncio.gather(*tasks)
            responses.extend(batch_results)
            LOGGER.debug('[Link Check] request batch %s of %s', batch_number, batch_count)

    return responses

def retry_validation(url_list, course_key, retry_count=3):
"""Retry urls that failed due to connection error."""
results = []
retry_list = url_list
for i in range(0, retry_count):
if retry_list:
LOGGER.debug(f'[Link Check] retry attempt #{i+1}')
validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100))
filetered_url_list, retry_list = filter_by_status(validated_url_list)
results.extend(filetered_url_list)

async def _validate_url_access(session, url_data, course_key):
    """
    Request a single url and return the HTTP status of the response.

    Arguments:
        session: object whose ``get`` returns an async context manager yielding a response
            (an ``aiohttp.ClientSession`` at the visible call site).
        url_data: ``[block_id, url]`` pair.
        course_key: forwarded to ``_convert_to_standard_url``.

    Returns:
        dict: ``{'block_id': ..., 'url': ..., 'status': ...}``; ``status`` is the response
        status, or ``None`` when the request raised.
    """
    block_id, url = url_data
    result = {'block_id': block_id, 'url': url}
    standardized_url = _convert_to_standard_url(url, course_key)
    # ClientTimeout is the documented way to pass a timeout; a bare number is accepted by
    # aiohttp only for backward compatibility. total=5 keeps the same 5 second budget.
    request_timeout = aiohttp.ClientTimeout(total=5)
    try:
        async with session.get(standardized_url, timeout=request_timeout) as response:
            result['status'] = response.status
    except Exception as e:  # pylint: disable=broad-except
        # Best effort: a failed request is recorded with status None rather than aborting.
        result['status'] = None
        LOGGER.debug('[Link Check] Request error when validating %s: %s', url, e)
    return result

async def _validate_urls_access_in_batches(url_list, course_key, batch_size=100):
    """
    Request every url in url_list, batch_size urls at a time.

    Each batch opens its own ``aiohttp.ClientSession`` and awaits its requests together
    with ``asyncio.gather``.

    Arguments:
        url_list: ``[ [block_id1, url1], [block_id2, url2], ... ]``
        course_key: forwarded to ``_validate_url_access``.
        batch_size: number of urls requested concurrently per batch.

    Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ]
    """
    responses = []
    batch_starts = range(0, len(url_list), batch_size)
    # The range length is the true number of batches. The previous
    # ``url_count // batch_size + 1`` overstated it by one for exact multiples of batch_size.
    batch_count = len(batch_starts)

    for batch_number, start in enumerate(batch_starts, start=1):
        batch = url_list[start:start + batch_size]
        async with aiohttp.ClientSession() as session:
            tasks = [_validate_url_access(session, url_data, course_key) for url_data in batch]
            batch_results = await asyncio.gather(*tasks)
            responses.extend(batch_results)
            LOGGER.debug('[Link Check] request batch %s of %s', batch_number, batch_count)

    return responses

def _retry_validation(url_list, course_key, retry_count=3):
"""Retry urls that failed due to connection error."""
results = []
retry_list = url_list
for i in range(0, retry_count):
if retry_list:
LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error')
LOGGER.debug(f'[Link Check] retry attempt #{i + 1}')
validated_url_list = asyncio.run(
_validate_urls_access_in_batches(retry_list, course_key, batch_size=100)
)
filetered_url_list, retry_list = _filter_by_status(validated_url_list)
results.extend(filetered_url_list)

return results
results.extend(retry_list)

def filter_by_status(results):
    """
    Sort request results into links to report and links to retry.

    Status handling:
        200: OK. Dropped.
        403 on a studio url: recorded as a locked link.
        None: the request errored. Queued for retry.
        Anything else (including 403 on a non-studio url): recorded as a broken link.

    Arguments:
        results: [ {'block_id': ..., 'url': ..., 'status': ...}, ... ]

    Returns:
        filtered_results: [ [block_id1, url1, is_locked], ... ]
        retry_list: [ [block_id1, url1], ... ]
    """
    filtered_results = []
    retry_list = []
    for result in results:
        # Identity check: None is a singleton, and ``== None`` can be fooled by custom __eq__.
        if result['status'] is None:
            retry_list.append([result['block_id'], result['url']])
        elif result['status'] == 200:
            continue
        elif result['status'] == 403 and is_studio_url(result['url']):
            filtered_results.append([result['block_id'], result['url'], True])
        else:
            filtered_results.append([result['block_id'], result['url'], False])

    return filtered_results, retry_list
return results

user = validate_user()
def _filter_by_status(results):
    """
    Split request results into links to report and links to retry.

    Status handling:
        200: OK. Nothing is recorded.
        403 on a studio url: recorded with is_locked True.
        None: the request errored. Queued for retry.
        Anything else: recorded with is_locked False.

    Returns:
        filtered_results: [ [block_id1, url1, is_locked], ... ]
        retry_list: [ [block_id1, url1], ... ]
    """
    filtered_results = []
    retry_list = []

    for entry in results:
        status = entry['status']
        if status == 200:
            continue

        link = [entry['block_id'], entry['url']]
        if status is None:
            retry_list.append(link)
            continue

        # Only a 403 on a studio url counts as locked; every other status is a broken link.
        is_locked = bool(status == 403 and _is_studio_url(entry['url']))
        filtered_results.append([*link, is_locked])

    return filtered_results, retry_list

@shared_task(base=CourseLinkCheckTask, bind=True)
def check_broken_links(self, user_id, course_key_string, language):
"""
Checks for broken links in a course. Store the results in a file.
"""
user = _validate_user(self, user_id, language)

self.status.set_state('Scanning')
course_key = CourseKey.from_string(course_key_string)
url_list = scan_course_for_links(course_key)
validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100))
broken_or_locked_urls, retry_list = filter_by_status(validated_url_list)
url_list = _scan_course_for_links(course_key)
validated_url_list = asyncio.run(_validate_urls_access_in_batches(url_list, course_key, batch_size=100))
broken_or_locked_urls, retry_list = _filter_by_status(validated_url_list)

if retry_list:
retry_results = retry_validation(retry_list, course_key, retry_count=3)
retry_results = _retry_validation(retry_list, course_key, retry_count=3)
broken_or_locked_urls.extend(retry_results)

try:
Expand All @@ -1273,7 +1276,7 @@ def filter_by_status(results):
artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file))
artifact.save()

# catch all exceptions so we can record useful error messages
except Exception as e: # pylint: disable=broad-except
LOGGER.exception('Error checking links for course %s', course_key, exc_info=True)
Expand Down
Loading

0 comments on commit ebee1d9

Please sign in to comment.