forked from AndyJPhillips/ONT-HG1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsigned-url-requestor.py
executable file
·477 lines (416 loc) · 18 KB
/
signed-url-requestor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
#!/usr/bin/env python
"""
Generates signed URLs & makes corresponding requests to Requestor Pays buckets.
Compulsory arguments:
url the url of the file to download from S3
Your AWS API keys, by supplying either:
--access-key your API access key
--secret-key your API secret key
or:
--aws-creds-file aws credentials file, formatted like a boto creds file
will default to ~/.boto
Optional arguments:
--profile the section within your aws credentials file to use
--output-dir the directory to save the downloaded file in
--index-file if set, script assumes you're downloading an index file
which should be parsed for urls of files to download
-v verbose output for debugging
"""
import ConfigParser
import argparse
import os
import sys
from datetime import datetime
from pytz import timezone
import re
import base64
import hmac
import sha
import urllib2
def parse_args(argv):
    """Return the parsed arguments passed on the CLI.

    Args:
        argv: list of argument strings to parse, without the program
            name (e.g. ``sys.argv[1:]``). ``None`` makes argparse fall
            back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``url``, ``access_key``,
        ``secret_key``, ``aws_creds_file``, ``profile``, ``output_dir``,
        ``index_file`` and ``verbose``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=("Signed URL generator for Requestor Pays S3 buckets\n"
                     "minimum requirements:\n"
                     " - s3 file url\n"
                     " - aws credentials "
                     "(as CLI arguments or in a credentials file)"))

    parser.add_argument('url',
                        help="url of the file to download from S3")
    parser.add_argument('--access-key',
                        help="your aws_access_key_id",
                        dest='access_key')
    parser.add_argument('--secret-key',
                        help="your aws_secret_access_key",
                        dest='secret_key')
    # const mirrors default so that a bare "--aws-creds-file" (allowed by
    # nargs='?') yields the default path instead of None
    parser.add_argument('--aws-creds-file',
                        help=("path to aws credentials file, "
                              "defaults to ~/.boto"),
                        nargs='?',
                        const='~/.boto',
                        default='~/.boto',
                        dest='aws_creds_file')
    parser.add_argument('--profile',
                        help=("profile/section within the "
                              "aws credentials file"),
                        dest='profile')
    # same reasoning as --aws-creds-file for const
    parser.add_argument('--output-dir',
                        help=("the directory to save the requested file in, "
                              "defaults to current directory"),
                        nargs='?',
                        const='./',
                        default='./',
                        dest='output_dir')
    parser.add_argument('--index-file',
                        action='count',
                        help=("if set, the script assumes the url represents "
                              "an index file. "
                              "The file downloaded will be read "
                              "to find additional urls to download"),
                        dest='index_file')
    # default=0 keeps the value an int, so "verbosity >= 1" comparisons
    # never have to compare None with a number
    parser.add_argument('-v', '--verbose',
                        action='count',
                        default=0,
                        help="verbose output",
                        dest='verbose')

    # parse the list we were handed; a bare parse_args() would ignore
    # argv and always read sys.argv
    return parser.parse_args(argv)
def parse_creds_file(path, taglist, profile):
    """Parse an aws credentials file and return the requested options.

    Reads the module-level ``verbosity`` setting for debug output.

    Args:
        path: path to the credentials file; a leading ``~`` is expanded.
        taglist: option names to look up in the chosen section.
        profile: name of the section to read. When empty, the first
            existing section out of default/Default/DEFAULT/
            credentials/Credentials/CREDENTIALS is used.

    Returns:
        dict mapping every tag in ``taglist`` to its value (``None`` when
        the option is missing from the section), or ``False`` when the
        file or the section cannot be read.
    """
    print("No API keys passed as CLI arguments, so checking credentials file")
    path = os.path.expanduser(path)
    if verbosity >= 1:
        print("Expanded path to aws credentials file: {}".format(path))
    if not os.path.isfile(path):
        print("AWS credentials file does not exist: {}".format(path))
        return False

    config = ConfigParser.SafeConfigParser()
    try:
        config.read(path)
    except ConfigParser.Error as error:
        # read() silently skips missing files; what it raises are
        # syntax errors in the file, so report it as such
        print("Cannot parse aws credentials file: {}".format(path))
        if verbosity >= 1:
            print("The full error is {}".format(error))
        return False

    if not profile:
        # no profile requested, fall back to a default/credentials section
        if verbosity >= 1:
            print("No profile specified on the CLI, "
                  "looking for a default/credentials profile ")
        fallbacks = ("default", "Default", "DEFAULT",
                     "credentials", "Credentials", "CREDENTIALS")
        for candidate in fallbacks:
            if config.has_section(candidate):
                profile = candidate
                break
        else:
            print("Cannot find a default or credentials profile "
                  "in your aws credentials file: {}".format(path))
            return False

    try:
        available = config.options(profile)
    except ConfigParser.NoSectionError:
        print("Cannot find a section in the aws credentials file "
              "called: {}".format(profile))
        return False
    if verbosity >= 1:
        print("Found the {} profile "
              "in your aws credentails file: {}".format(profile, path))

    # every requested tag is present in the result; absent options map to
    # None so the caller can tell "missing key" from "missing file"
    return {tag: (config.get(profile, tag) if tag in available else None)
            for tag in taglist}
def dissect_s3url(url):
    """Return the s3 bucket & prefix parsed from an S3 url.

    Supported formats (``<prefix>`` always starts with ``/``):
        s3://<bucket><prefix>
        http(s)://<bucket>.s3.amazonaws.com<prefix>
        http(s)://<bucket>.s3-website-<region>.amazonaws.com<prefix>
        http(s)://s3-<region>.amazonaws.com/<bucket><prefix>

    A single trailing newline (as found on lines read from a file) is
    tolerated and is not part of the returned prefix.

    Args:
        url: the url string to dissect.

    Returns:
        tuple ``(bucket, prefix)``, or ``False`` when the url does not
        match any supported format. Callers unpack the result and treat
        the resulting ``TypeError`` as "unsupported url".
    """
    # Each pattern captures (bucket, prefix). The schemes are mutually
    # exclusive, so trying them in order is equivalent to branching on
    # the scheme first. Every literal dot is escaped so that e.g.
    # "amazonawsXcom" is not accepted.
    patterns = (
        r'^s3://([^/]+)(/.*?)$',
        r'^https?://([^.]+)[.]s3[.]amazonaws[.]com(/.*?)$',
        r'^https?://([^.]+)[.]s3-website-[^.]+[.]amazonaws[.]com(/.*?)$',
        r'^https?://s3[^.]+[.]amazonaws[.]com/([^/]+)(/.*?)$',
    )
    for pattern in patterns:
        found = re.match(pattern, url)
        if found:
            return (found.group(1), found.group(2))
    # nothing matched, including "s3://bucket" with no key, which used to
    # blow up with AttributeError on a None match object
    return False
def get_datetime():
    """Return the current time in GMT, formatted for an HTTP Date header.

    Returns:
        str such as ``"Tue, 27 Mar 2007 19:36:42 GMT"``.
    """
    # utcnow() rather than now(): the string is labelled GMT, so the clock
    # reading has to be GMT too. Stamping local wall-clock time as GMT
    # skews the Date header by the machine's UTC offset.
    # %H:%M:%S rather than %X, whose output depends on the locale.
    return datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S GMT")
def make_signature(bucket, prefix, aws_secret_key, date=None):
    """Return the bucket, prefix and signature for a Requester Pays GET.

    Reads the module-level ``verbosity`` setting for debug output.

    Args:
        bucket: name of the S3 bucket.
        prefix: object key, starting with ``/``.
        aws_secret_key: secret key used as the HMAC key.
        date: the Date header value to sign. It must be the very same
            string that is later sent as the request's Date header.
            Defaults to ``get_datetime()``.

    Returns:
        tuple ``(bucket, prefix, signature)`` where signature is the
        base64 encoded HMAC-SHA1 of the string to sign, without any
        surrounding whitespace.

    Raises:
        TypeError: if bucket, prefix or the secret key is not a string.
    """
    if date is None:
        date = get_datetime()
    string_to_sign = "\n".join([
        "GET",
        "",  # empty line between the verb and the date
        "",  # second empty line
        date,
        "x-amz-request-payer:requester",
        "/" + bucket + prefix,
    ])
    if verbosity >= 1:
        print("string_to_sign set to:\n{}".format(string_to_sign))
    h = hmac.new(aws_secret_key, string_to_sign, sha)
    # strip the base64 text, never the raw digest: the digest is binary
    # and may legitimately begin or end with a whitespace byte
    signature = base64.encodestring(h.digest()).strip()
    if verbosity >= 1:
        print("\nSignature calculated is: {}".format(signature))
    return (bucket, prefix, signature)


def make_request(bucket, prefix, signature, aws_access_key, date=None):
    """Make the signed request, return the S3 object requested.

    Reads the module-level ``verbosity`` setting for debug output.

    Args:
        bucket: name of the S3 bucket.
        prefix: object key, starting with ``/``.
        signature: signature produced by ``make_signature``.
        aws_access_key: access key id placed in the Authorization header.
        date: value for the Date header; must be the string that was
            signed. Defaults to ``get_datetime()``.

    Returns:
        the open urllib2 response object, or ``False`` if the request
        failed (the error is printed).
    """
    if verbosity >= 1:
        print("Determined the bucket is: {}".format(bucket))
        print("Determined the prefix is: {}".format(prefix))
    if date is None:
        date = get_datetime()

    # headers required for a Requester Pays bucket
    auth_header = ("AWS " + aws_access_key + ":" + signature).strip()
    if verbosity >= 1:
        print("Value of Authorization header is: {}".format(auth_header))
    headers = {"Authorization": auth_header,
               "Date": date,
               "x-amz-request-payer": "requester"}

    url = "https://" + bucket + ".s3.amazonaws.com" + prefix
    req = urllib2.Request(url, None, headers)
    print("Created the signed URL request")
    if verbosity >= 1:
        print("\nThe signed URL request is:")
        print("{} {}".format(req.get_method(), req.get_full_url()))
        print("with the HTTP headers set to:")
        for header in req.header_items():
            print(header)

    print("Executing the signed URL request")
    try:
        opener = urllib2.build_opener()
        # third argument is the timeout in seconds
        return opener.open(req, None, 10)
    except urllib2.HTTPError as error:
        print("Error encountered attempting to download S3 object:")
        print("Got HTTP response: {} {}".format(error.code, error.msg))
        if verbosity >= 1:
            print("\nResponse HTTP headers:\n{}".format(error.info()))
            print("\nResponse body:\n{}".format(error.read()))
        return False
    except urllib2.URLError as url_error:
        print("Error encountered attempting to download "
              "S3 object:{}".format(url_error))
        return False


def download_file(bucket, prefix, aws_access_key, aws_secret_key, directory):
    """Sign & make request, save file returned.

    Reads the module-level ``verbosity`` setting for debug output.

    Args:
        bucket: name of the S3 bucket.
        prefix: object key, starting with ``/``; the part after the last
            ``/`` becomes the local file name.
        aws_access_key: access key id.
        aws_secret_key: secret access key.
        directory: directory to save the file in.

    Returns:
        the absolute path of the saved file, or ``False`` on any failure
        (the error is printed).
    """
    print("\n========================\n"
          "Creating the signature to use in the request")
    # one timestamp shared by the signature and the Date header; taking
    # it twice can straddle a second boundary and invalidate the signature
    date = get_datetime()
    try:
        (bucket, prefix, signature) = make_signature(bucket,
                                                     prefix,
                                                     aws_secret_key,
                                                     date)
    except TypeError:
        print("Unable to complete signature creation")
        return False

    response = make_request(bucket, prefix, signature, aws_access_key, date)
    if response is False:
        return False
    if verbosity >= 1:
        print("\nResponse summary:")
        print("request url:{}".format(response.url))
        print("status code:{}".format(response.code))
        print("msg:{}".format(response.msg))
        print("\nResponse HTTP headers:\n{}".format(response.info()))
    print("\nDownloading: {}".format(response.url))

    # file name is whatever follows the last / of the prefix
    filename = prefix.rsplit('/', 1)[-1]
    # join with "" leaves exactly one trailing separator on the directory
    output_dir = os.path.join(os.path.abspath(directory), "")
    filepath = output_dir + filename

    chunk_size = 1024 * 1024
    try:
        # binary mode: the object is written byte-for-byte; copy in
        # chunks so large objects are never held in memory in one piece
        with open(filepath, 'wb') as output_file:
            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    break
                output_file.write(chunk)
        return filepath
    except IOError as error:
        print("Error writing s3 object to file: {}\n"
              "It's likely that the specified directory {} "
              "doesn't exist".format(filepath, output_dir))
        if verbosity >= 1:
            print("The full error is {}".format(error))
        return False
    finally:
        response.close()
def _read_index_urls(index_path):
    """Return the non-blank lines of the index file, whitespace stripped."""
    with open(index_path, 'r') as index_file:
        stripped = (line.strip() for line in index_file)
        return [line for line in stripped if line]


def _confirm(question):
    """Ask a Y/N question until it is answered; True for Y, False for N."""
    resp = raw_input(question)
    while True:
        resp = resp.strip().lower()
        if resp == "y":
            return True
        if resp == "n":
            return False
        resp = raw_input("Invalid response, "
                         "please respond with Y/N: ")


def main(argv):
    """Main function for the script, which calls everything else.

    Resolves the AWS API keys (CLI arguments first, credentials file
    otherwise), downloads the object the url points at using a signed
    Requester Pays request and, with --index-file, downloads every url
    listed in that object as well.

    Args:
        argv: CLI arguments without the program name.

    Returns:
        ``False`` on any failure, ``None`` on success.
    """
    # set verbosity global variable so it can be used in all functions
    global verbosity

    # parse first, so that --help and usage errors do not have to wait
    # for the confirmation prompt below
    args = parse_args(argv)
    verbosity = args.verbose or 0

    # make it clear that the user will pay for requests & downloads
    print("You are about to download files from a "
          "'Requestor Pays' S3 bucket\n"
          "This means you will be charged for each request & download\n"
          "Further details of these costs can be found here:\n"
          "https://aws.amazon.com/s3/pricing/\n")
    raw_input("Press any key to continue...\n")

    # If aws keys aren't specified on the cli, read them from creds file
    key_tags = ['aws_access_key_id', 'aws_secret_access_key']
    if args.access_key is None or args.secret_key is None:
        params = parse_creds_file(args.aws_creds_file,
                                  key_tags,
                                  args.profile)
        if not params:
            print("Error encountered whilst attempting to "
                  "parse your AWS credentials file")
            return False
        args.access_key = params[key_tags[0]]
        args.secret_key = params[key_tags[1]]
    if args.access_key is None or args.secret_key is None:
        print("Unable to get AWS API keys, "
              "either specify them on the CLI "
              "(--access-key, --secret-key), "
              "or in a creds file (--aws-creds-file) "
              "or ensure they are in ~/.boto")
        return False
    print("AWS API keys found")
    if verbosity >= 1:
        # NOTE(review): verbose mode prints the secret key in clear text;
        # avoid pasting verbose output into tickets or shared logs
        print("aws_access_key_id is {}".format(args.access_key))
        print("aws_secret_access_key is {}".format(args.secret_key))

    print("URL provided: {}".format(args.url))
    location = dissect_s3url(args.url)
    if not location:
        print("URL not a supported S3 URL, "
              "please provide the S3 URL in one of these formats:\n"
              "s3://<bucket><prefix>\n"
              "http(s)://<bucket>.s3.amazonaws.com<prefix>\n"
              "http(s)://<bucket>.s3-website-eu-west-1.amazonaws.com<prefix>\n"
              "http(s)://s3-eu-west-1.amazonaws.com/<bucket><prefix>\n"
              "NB: <prefix> always starts with a /")
        # stop here: there is no bucket/prefix to download
        return False
    (bucket, prefix) = location

    filepath = download_file(bucket,
                             prefix,
                             args.access_key,
                             args.secret_key,
                             args.output_dir)
    if filepath is False:
        print("Error downloading & saving file, exiting")
        return False
    print("Saved retrieved S3 object to: {}"
          "\n========================\n".format(filepath))

    if not args.index_file:
        return None

    # process index files, downloading all files within the index file
    if not prefix.endswith("index.txt"):
        print("Sorry the URL you provided {}"
              " does not represent a correctly named index file.\n"
              "Please ensure the file is named *.index.txt\n"
              "and only contains S3 URLs of files to download, "
              "one per line".format(args.url))
        return False
    print("You provided a correctly named index file")

    # validate every entry before asking for confirmation, so a bad line
    # cannot abort the run half way through the downloads
    urls = _read_index_urls(filepath)
    if not urls:
        print("First line of the index file is not an S3 URL\n"
              "Please ensure the index file only contains "
              " S3 URLs, "
              "one per line")
        return False
    targets = []
    for url in urls:
        target = dissect_s3url(url)
        if not target:
            print("Index file entry is not an S3 URL: {}\n"
                  "Please ensure the index file only contains "
                  "S3 URLs, one per line".format(url))
            return False
        targets.append(target)
    print("At an initial glance, index file looks good")

    # confirm the user really wants to download everything listed
    proceed = _confirm("You are about to download all files listed "
                       "in {}\nWith one S3 URL per line, this "
                       "represents {} files\n"
                       "Are you sure you want to proceed? "
                       "(Y/N): ".format(args.url, len(targets)))
    if not proceed:
        print("OK exiting without downloading all files "
              "listed in {}".format(args.url))
        return False

    for (bucket, prefix) in targets:
        # sign & make request, download the file
        filepath = download_file(bucket,
                                 prefix,
                                 args.access_key,
                                 args.secret_key,
                                 args.output_dir)
        if filepath is False:
            print("Error downloading & saving file, "
                  "exiting")
            return False
        print("Saved retrieved S3 object "
              "to: {}".format(filepath))
        print("========================\n")
    return None
if __name__ == "__main__":
    try:
        # main() returns False on any failure; turn that into a non-zero
        # exit status so calling scripts can detect a failed download
        succeeded = main(sys.argv[1:]) is not False
    except KeyboardInterrupt:
        print("\nYou pressed Ctrl+C so I quit")
        # 128 + SIGINT, the conventional status for an interrupted program
        sys.exit(130)
    sys.exit(0 if succeeded else 1)