Skip to content

Commit

Permalink
[1344] WMS links detected as broken - Archiver now queries WMS and WFS ok.
Browse files Browse the repository at this point in the history
  • Loading branch information
David Read committed Feb 21, 2014
1 parent 15181a2 commit d067f41
Show file tree
Hide file tree
Showing 6 changed files with 3,664 additions and 67 deletions.
137 changes: 111 additions & 26 deletions ckanext/archiver/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import traceback
import shutil
import datetime
import copy

from requests.packages import urllib3

Expand Down Expand Up @@ -68,23 +69,26 @@ def _clean_content_type(ct):
return ct

def download(context, resource, url_timeout=30,
max_content_length=settings.MAX_CONTENT_LENGTH):
max_content_length=settings.MAX_CONTENT_LENGTH,
method='GET'):
'''Given a resource, tries to download it.
Params:
resource - dict of the resource
Exceptions from link_checker may be propagated:
Exceptions from tidy_url may be propagated:
LinkInvalidError if the URL is invalid
LinkHeadRequestError if HEAD request fails
If there is an error performing the download, raises:
DownloadException - connection problems etc.
DownloadError - HTTP status code is an error
DownloadError - HTTP status code is an error or 0 length
If download is not suitable (e.g. too large), raises:
ChooseNotToDownload
If the basic GET fails then it will try it with common API
parameters (SPARQL, WMS etc) to get a better response.
Returns a dict of results of a successful download:
length, hash, headers, saved_file, url_redirected_to
Updates the resource values for: mimetype, size, hash
Expand All @@ -102,7 +106,8 @@ def download(context, resource, url_timeout=30,

# start the download - just get the headers
# May raise DownloadException
res = convert_requests_exceptions(requests.get, url, timeout=url_timeout)
method_func = {'GET': requests.get, 'POST': requests.post}[method]
res = convert_requests_exceptions(method_func, url, timeout=url_timeout)
url_redirected_to = res.url if url != res.url else None
if not res.ok: # i.e. 404 or something
raise DownloadError('Server reported status error: %s %s' % \
Expand Down Expand Up @@ -152,6 +157,11 @@ def get_content():
return res.content
content = convert_requests_exceptions(get_content)

if response_is_an_api_error(content):
raise DownloadError('Server content contained an API error message: %s' % \
content[:250],
url_redirected_to)

if len(content) > max_content_length:
raise ChooseNotToDownload("Content-length %s exceeds maximum allowed value %s" %
(content_length, max_content_length),
Expand Down Expand Up @@ -210,7 +220,8 @@ def get_content():
'hash': hash,
'headers': res.headers,
'saved_file': saved_file_path,
'url_redirected_to': url_redirected_to}
'url_redirected_to': url_redirected_to,
'request_type': method}


@celery.task(name="archiver.clean")
Expand Down Expand Up @@ -323,39 +334,39 @@ def _save_status(has_passed, status_txt, exception, status, resource_id, url_red

log.info("Attempting to download resource: %s" % resource['url'])
result = None
download_error = 0
try:
result = download(context, resource)
if result is None:
raise ArchiverError("Download failed")
except LinkInvalidError, e:
log.info('URL invalid: %r, %r', e, e.args)
_save_status(False, 'URL invalid', e, status, resource['id'])
return
except LinkHeadRequestError, e:
log.info('Link head request error: %s', e.args)
_save_status(False, 'URL request failed', e, status, resource['id'])
return
download_error = 'URL invalid'
try_as_api = False
except DownloadException, e:
log.info('Server communication error: %r, %r', e, e.args)
_save_status(False, 'Download error', e, status, resource['id'])
return
download_error = 'Download error'
try_as_api = True
except DownloadError, e:
log.info('Download failed: %r, %r', e, e.args)
_save_status(False, 'Download error', e, status, resource['id'],
e.url_redirected_to)
return
download_error = 'Download error'
try_as_api = True
except ChooseNotToDownload, e:
log.info('Download not carried out: %r, %r', e, e.args)
_save_status(False, 'Chose not to download', e, status, resource['id'],
e.url_redirected_to)
return
download_error = 'Chose not to download'
try_as_api = False
except Exception, e:
if os.environ.get('DEBUG'):
raise
log.error('Uncaught download failure: %r, %r', e, e.args)
_save_status(False, 'Download failure', e, status, resource['id'])
return

if download_error:
log.info('GET error: %s - %r, %r "%s"', download_error, e, e.args, resource.get('url'))
if try_as_api:
result = api_request(context, resource)

if not try_as_api or not result:
extra_args = [e.url_redirected_to] if 'url_redirected_to' in e else []
_save_status(False, download_error, e, status, resource['id'],
*extra_args)
return

log.info('Attempting to archive resource')
try:
file_path = archive_resource(context, resource, log, result)
Expand Down Expand Up @@ -752,3 +763,77 @@ def convert_requests_exceptions(func, *args, **kwargs):
raise
raise DownloadException('Error with the download: %s' % e)
return response

def set_ogc_url_params(resource, service, wms_version):
    '''Rewrites resource['url'] in place as an OGC GetCapabilities request.

    Any existing query string and fragment are dropped from the URL, then the
    service, request=GetCapabilities and version parameters are appended.

    Params:
      resource - dict of the resource; its 'url' value is replaced
      service - service name to put in the query, e.g. 'WMS' or 'WFS'
      wms_version - version string to put in the query (it is used for
                    whichever service is given, not only WMS)
    '''
    # Remove parameters. The fragment has to go as well: a query string
    # appended after a '#' would be treated as part of the fragment and so
    # would never be sent to the server.
    base_url = resource['url'].split('#', 1)[0].split('?', 1)[0]
    # Add GetCapabilities parameters
    resource['url'] = '%s?service=%s&request=GetCapabilities&version=%s' % (
        base_url, service, wms_version)

def wms_1_3_request(context, resource):
    '''Downloads the resource as a WMS 1.3 GetCapabilities request.

    The resource's 'url' is rewritten in place before downloading. Any
    exception raised by download() is propagated.

    Returns the download() result dict, with 'request_type' set to 'WMS 1.3'.
    '''
    set_ogc_url_params(resource, 'WMS', '1.3')
    download_result = download(context, resource)
    # Record which sort of API request elicited this response
    download_result['request_type'] = 'WMS 1.3'
    return download_result

def wms_1_1_1_request(context, resource):
    '''Downloads the resource as a WMS 1.1.1 GetCapabilities request.

    The resource's 'url' is rewritten in place before downloading. Any
    exception raised by download() is propagated.

    Returns the download() result dict, with 'request_type' set to
    'WMS 1.1.1'.
    '''
    set_ogc_url_params(resource, 'WMS', '1.1.1')
    download_result = download(context, resource)
    # Record which sort of API request elicited this response
    download_result['request_type'] = 'WMS 1.1.1'
    return download_result

def wfs_request(context, resource):
    '''Downloads the resource as a WFS 2.0 GetCapabilities request.

    The resource's 'url' is rewritten in place before downloading. Any
    exception raised by download() is propagated.

    Returns the download() result dict, with 'request_type' set to 'WFS 2.0'.
    '''
    set_ogc_url_params(resource, 'WFS', '2.0')
    download_result = download(context, resource)
    # Record which sort of API request elicited this response
    download_result['request_type'] = 'WFS 2.0'
    return download_result

def api_request(context, resource):
    '''
    Tries making requests as if the resource is a well-known sort of API
    (WMS 1.3, then WMS 1.1.1, then WFS) to try and get a valid response.

    Params:
      context - passed through to each request function
      resource - dict of the resource. It is not modified - each attempt
                 works on a deep copy.

    Returns the result dict of the first request that succeeds (its
    'request_type' value stores what sort of request elicited it), or None
    if none of the requests succeed.

    If the DEBUG environment variable is set then unexpected exceptions
    (i.e. not ArchiverError) are re-raised rather than logged.
    '''
    log = update.get_logger()
    # 'resource' holds the results of the download and will get saved. Only if
    # an API request is successful do we want to save the details of it.
    # However the resource dict gets altered for these API requests (e.g. its
    # url is rewritten). So only give each request a copy of 'resource'.
    for api_request_func in (wms_1_3_request, wms_1_1_1_request, wfs_request):
        resource_copy = copy.deepcopy(resource)
        # Log the function's name - the repr of the function object is just
        # noise (it includes a memory address).
        func_name = api_request_func.__name__
        try:
            download_dict = api_request_func(context, resource_copy)
        except ArchiverError as e:
            # An expected sort of failure - move on to the next sort of API
            log.info('API %s error: %r, %r "%s"', func_name,
                     e, e.args, resource.get('url'))
            continue
        except Exception as e:
            if os.environ.get('DEBUG'):
                raise
            log.error('Uncaught API %s failure: %r, %r', func_name,
                      e, e.args)
            continue

        return download_dict

    # No sort of API request got a valid response
    return None

def response_is_an_api_error(response_body):
    '''Some APIs return errors as the response body, but HTTP status 200. So we
    need to check response bodies for these error messages.

    Params:
      response_body - the content of the response (str or bytes). May be
                      empty or None.

    Returns True if the start of the body contains a known error report
    element, otherwise False.
    '''
    if not response_body:
        return False

    response_sample = response_body[:250]  # to allow for <?xml> and <!DOCTYPE> lines
    if isinstance(response_sample, bytes) and not isinstance(response_sample, str):
        # On Python 3 a bytes body can't be searched with str markers.
        # latin-1 never fails to decode and leaves the ASCII markers intact.
        # (On Python 2 bytes is str, so this branch is not taken.)
        response_sample = response_sample.decode('latin-1')

    error_markers = (
        # WMS spec
        # e.g. https://map.bgs.ac.uk/ArcGIS/services/BGS_Detailed_Geology/MapServer/WMSServer?service=abc
        # <?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
        # <ServiceExceptionReport version="1.3.0"
        '<ServiceExceptionReport',

        # This appears to be an alternative - I can't find the spec.
        # e.g. http://sedsh13.sedsh.gov.uk/ArcGIS/services/HS/Historic_Scotland/MapServer/WFSServer?service=abc
        # <ows:ExceptionReport version='1.1.0' language='en' xmlns:ows='http://www.opengis.net/ows'><ows:Exception exceptionCode='NoApplicableCode'><ows:ExceptionText>Wrong service type.</ows:ExceptionText></ows:Exception></ows:ExceptionReport>
        '<ows:ExceptionReport',
    )
    # Always return a real bool rather than falling off the end with None
    return any(marker in response_sample for marker in error_markers)

1 change: 1 addition & 0 deletions tests/data/wfs_getcap.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<?xml version="1.0" encoding="utf-8" ?><wfs:WFS_Capabilities version='1.1.0' xsi:schemaLocation='http://www.opengis.net/gml http://schemas.opengis.net/gml/3.1.1/base/gml.xsd http://www.opengis.net/ogc http://schemas.opengis.net/filter/1.1.0/filter.xsd http://www.opengis.net/ows http://schemas.opengis.net/ows/1.0.0/owsAll.xsd http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd' xmlns:wfs='http://www.opengis.net/wfs' xmlns:ogc='http://www.opengis.net/ogc' xmlns:gml='http://www.opengis.net/gml' xmlns:ows='http://www.opengis.net/ows' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xmlns:xlink='http://www.w3.org/1999/xlink' xmlns:HS_Historic_Scotland='http://sedsh13.sedsh.gov.uk/ArcGIS/services/HS/Historic_Scotland/MapServer/WFSServer'><ows:ServiceIdentification><ows:Title>HS_Historic_Scotland</ows:Title><ows:Abstract></ows:Abstract><ows:Keywords><ows:Keyword></ows:Keyword></ows:Keywords><ows:ServiceType>WFS</ows:ServiceType><ows:ServiceTypeVersion>1.1.0</ows:ServiceTypeVersion><ows:Fees></ows:Fees><ows:AccessConstraints></ows:AccessConstraints></ows:ServiceIdentification><ows:ServiceProvider><ows:ProviderName></ows:ProviderName><ows:ServiceContact><ows:IndividualName></ows:IndividualName><ows:PositionName></ows:PositionName><ows:ContactInfo><ows:Phone><ows:Voice></ows:Voice><ows:Facsimile></ows:Facsimile></ows:Phone><ows:Address><ows:DeliveryPoint></ows:DeliveryPoint><ows:City></ows:City><ows:AdministrativeArea></ows:AdministrativeArea><ows:PostalCode></ows:PostalCode><ows:Country></ows:Country><ows:ElectronicMailAddress></ows:ElectronicMailAddress></ows:Address><ows:HoursOfService></ows:HoursOfService><ows:ContactInstructions></ows:ContactInstructions></ows:ContactInfo></ows:ServiceContact></ows:ServiceProvider><ows:OperationsMetadata><ows:Operation name='GetCapabilities'><ows:DCP><ows:HTTP><ows:Get xlink:href='http://sedsh13.sedsh.gov.uk/ArcGIS/services/HS/Historic_Scotland/MapServer/WFSServer?'/><ows:Post 
xlink:href='http://sedsh13.sedsh.gov.uk/ArcGIS/services/HS/Historic_Scotland/MapServer/WFSServer'/></ows:HTTP></ows:DCP><ows:Parameter name='AcceptVersions'><ows:Value>1.1.0</ows:Value><ows:Value>1.0.0</ows:Value></ows:Parameter><ows:Parameter name='AcceptFormats'><ows:Value>text/xml</ows:Value></ows:Parameter></ows:Operation><ows:Operation name='DescribeFeatureType'><ows:DCP><ows:HTTP><ows:Get xlink:href='http://sedsh13.sedsh.gov.uk/ArcGIS/services/HS/Historic_Scotland/MapServer/WFSServer?'/><ows:Post xlink:href='http://sedsh13.sedsh.gov.uk/ArcGIS/services/HS/Historic_Scotland/MapServer/WFSServer'/></ows:HTTP></ows:DCP><ows:Parameter name='outputFormat'><ows:Value>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</ows:Value></ows:Parameter></ows:Operation><ows:Operation name='GetFeature'><ows:DCP><ows:HTTP><ows:Get xlink:href='http://sedsh13.sedsh.gov.uk/ArcGIS/services/HS/Historic_Scotland/MapServer/WFSServer?'/><ows:Post xlink:href='http://sedsh13.sedsh.gov.uk/ArcGIS/services/HS/Historic_Scotland/MapServer/WFSServer'/></ows:HTTP></ows:DCP><ows:Parameter name='resultType'><ows:Value>results</ows:Value><ows:Value>hits</ows:Value></ows:Parameter><ows:Parameter name='outputFormat'><ows:Value>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</ows:Value></ows:Parameter></ows:Operation><ows:ExtendedCapabilities><ows:Constraint name='serviceAxisOrderForSwappableSRS'><ows:Value>latitude,longitude</ows:Value></ows:Constraint></ows:ExtendedCapabilities></ows:OperationsMetadata><wfs:FeatureTypeList><wfs:FeatureType><wfs:Name>HS_Historic_Scotland:Scheduled_Monuments</wfs:Name><wfs:Title>Scheduled_Monuments</wfs:Title><wfs:DefaultSRS>urn:ogc:def:crs:EPSG:6.9:27700</wfs:DefaultSRS><wfs:OtherSRS>urn:ogc:def:crs:EPSG:6.9:4326</wfs:OtherSRS><wfs:OutputFormats><wfs:Format>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</wfs:Format></wfs:OutputFormats><ows:WGS84BoundingBox><ows:LowerCorner>-9.1822046217826792 
54.516771969950476</ows:LowerCorner><ows:UpperCorner>-0.75900744851719537 60.835322872527833</ows:UpperCorner></ows:WGS84BoundingBox></wfs:FeatureType><wfs:FeatureType><wfs:Name>HS_Historic_Scotland:Gardens_and_Designed_Landscapes</wfs:Name><wfs:Title>Gardens_and_Designed_Landscapes</wfs:Title><wfs:DefaultSRS>urn:ogc:def:crs:EPSG:6.9:27700</wfs:DefaultSRS><wfs:OtherSRS>urn:ogc:def:crs:EPSG:6.9:4326</wfs:OtherSRS><wfs:OutputFormats><wfs:Format>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</wfs:Format></wfs:OutputFormats><ows:WGS84BoundingBox><ows:LowerCorner>-7.0336513267162362 54.690856874082087</ows:LowerCorner><ows:UpperCorner>-0.93406280915045325 60.693925610178972</ows:UpperCorner></ows:WGS84BoundingBox></wfs:FeatureType><wfs:FeatureType><wfs:Name>HS_Historic_Scotland:Conservation_Areas</wfs:Name><wfs:Title>Conservation_Areas</wfs:Title><wfs:DefaultSRS>urn:ogc:def:crs:EPSG:6.9:27700</wfs:DefaultSRS><wfs:OtherSRS>urn:ogc:def:crs:EPSG:6.9:4326</wfs:OtherSRS><wfs:OutputFormats><wfs:Format>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</wfs:Format></wfs:OutputFormats><ows:WGS84BoundingBox><ows:LowerCorner>-7.8312651931150103 54.613553898871785</ows:LowerCorner><ows:UpperCorner>-1.1344320093117748 60.159542436118045</ows:UpperCorner></ows:WGS84BoundingBox></wfs:FeatureType><wfs:FeatureType><wfs:Name>HS_Historic_Scotland:Listed_Buildings</wfs:Name><wfs:Title>Listed_Buildings</wfs:Title><wfs:DefaultSRS>urn:ogc:def:crs:EPSG:6.9:27700</wfs:DefaultSRS><wfs:OtherSRS>urn:ogc:def:crs:EPSG:6.9:4326</wfs:OtherSRS><wfs:OutputFormats><wfs:Format>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</wfs:Format></wfs:OutputFormats><ows:WGS84BoundingBox><ows:LowerCorner>-8.3315203615463762 54.55075161215354</ows:LowerCorner><ows:UpperCorner>-0.71102800004906719 
60.859890744753315</ows:UpperCorner></ows:WGS84BoundingBox></wfs:FeatureType><wfs:FeatureType><wfs:Name>HS_Historic_Scotland:Battlefield_Inventory</wfs:Name><wfs:Title>Battlefield_Inventory</wfs:Title><wfs:DefaultSRS>urn:ogc:def:crs:EPSG:6.9:27700</wfs:DefaultSRS><wfs:OtherSRS>urn:ogc:def:crs:EPSG:6.9:4326</wfs:OtherSRS><wfs:OutputFormats><wfs:Format>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</wfs:Format></wfs:OutputFormats><ows:WGS84BoundingBox><ows:LowerCorner>-5.4080431504854083 55.475802770916687</ows:LowerCorner><ows:UpperCorner>-2.3029657578915508 57.948874132478359</ows:UpperCorner></ows:WGS84BoundingBox></wfs:FeatureType><wfs:FeatureType><wfs:Name>HS_Historic_Scotland:Historic_Marine_Protected_Areas</wfs:Name><wfs:Title>Historic_Marine_Protected_Areas</wfs:Title><wfs:DefaultSRS>urn:ogc:def:crs:EPSG:6.9:4326</wfs:DefaultSRS><wfs:OutputFormats><wfs:Format>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</wfs:Format></wfs:OutputFormats><ows:WGS84BoundingBox><ows:LowerCorner>-6.0776300198372644 56.038719476068309</ows:LowerCorner><ows:UpperCorner>-0.72131791373474718 60.425347509141559</ows:UpperCorner></ows:WGS84BoundingBox></wfs:FeatureType><wfs:FeatureType><wfs:Name>HS_Historic_Scotland:World_Heritage_Sites</wfs:Name><wfs:Title>World_Heritage_Sites</wfs:Title><wfs:DefaultSRS>urn:ogc:def:crs:EPSG:6.9:27700</wfs:DefaultSRS><wfs:OtherSRS>urn:ogc:def:crs:EPSG:6.9:4326</wfs:OtherSRS><wfs:OutputFormats><wfs:Format>text/xml; subType=gml/3.1.1/profiles/gmlsf/1.0.0/0</wfs:Format></wfs:OutputFormats><ows:WGS84BoundingBox><ows:LowerCorner>-8.9339253214762397 55.491284933123268</ows:LowerCorner><ows:UpperCorner>-2.9122016664017822 
59.155902536243687</ows:UpperCorner></ows:WGS84BoundingBox></wfs:FeatureType></wfs:FeatureTypeList><ogc:Filter_Capabilities><ogc:Spatial_Capabilities><ogc:GeometryOperands><ogc:GeometryOperand>gml:Envelope</ogc:GeometryOperand><ogc:GeometryOperand>gml:Point</ogc:GeometryOperand><ogc:GeometryOperand>gml:Polygon</ogc:GeometryOperand><ogc:GeometryOperand>gml:LineString</ogc:GeometryOperand></ogc:GeometryOperands><ogc:SpatialOperators><ogc:SpatialOperator name='BBOX'/><ogc:SpatialOperator name='Equals'/><ogc:SpatialOperator name='Disjoint'/><ogc:SpatialOperator name='Intersects'/><ogc:SpatialOperator name='Crosses'/><ogc:SpatialOperator name='Touches'/><ogc:SpatialOperator name='Within'/><ogc:SpatialOperator name='Contains'/><ogc:SpatialOperator name='Overlaps'/></ogc:SpatialOperators></ogc:Spatial_Capabilities><ogc:Scalar_Capabilities><ogc:LogicalOperators/><ogc:ComparisonOperators><ogc:ComparisonOperator>EqualTo</ogc:ComparisonOperator><ogc:ComparisonOperator>NotEqualTo</ogc:ComparisonOperator><ogc:ComparisonOperator>LessThan</ogc:ComparisonOperator><ogc:ComparisonOperator>GreaterThan</ogc:ComparisonOperator><ogc:ComparisonOperator>LessThanEqualTo</ogc:ComparisonOperator><ogc:ComparisonOperator>GreaterThanEqualTo</ogc:ComparisonOperator><ogc:ComparisonOperator>Like</ogc:ComparisonOperator><ogc:ComparisonOperator>Between</ogc:ComparisonOperator><ogc:ComparisonOperator>NullCheck</ogc:ComparisonOperator></ogc:ComparisonOperators></ogc:Scalar_Capabilities><ogc:Id_Capabilities><ogc:EID/><ogc:FID/></ogc:Id_Capabilities></ogc:Filter_Capabilities></wfs:WFS_Capabilities>
Loading

0 comments on commit d067f41

Please sign in to comment.