Skip to content

Commit

Permalink
Refactored warcio's loop-based, string/byte-concatenation-heavy sections to …
Browse files Browse the repository at this point in the history
…gain performance increases (https://docs.python.org/3/faq/programming.html#id37)

Changed the exception caught in try_brotli_init from ImportError to Exception, due to the finding that led to pywb PR #444 (webrecorder/pywb#444)
  • Loading branch information
N0taN3rd committed Feb 20, 2019
1 parent 7f533c8 commit 8fb6994
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 33 deletions.
28 changes: 15 additions & 13 deletions warcio/bufferedreaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def brotli_decompressor():
return decomp

BufferedReader.DECOMPRESSORS['br'] = brotli_decompressor
except ImportError: #pragma: no cover
except Exception: # pragma: no cover
pass


Expand Down Expand Up @@ -157,20 +157,19 @@ def read(self, length=None):
if at buffer boundary, will attempt to read again until
specified length is read
"""
all_buffs = []
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
all_buffs = bytearray()
while length is None or length > 0:
self._fillbuff()
if self.empty():
break

buff = self.buff.read(length)
all_buffs.append(buff)
all_buffs += buff
if length:
length -= len(buff)

return b''.join(all_buffs)


return bytes(all_buffs)

def readline(self, length=None):
"""
Expand All @@ -187,10 +186,12 @@ def readline(self, length=None):
if self.empty():
return b''

linebuff = self.buff.readline(length)
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
linebuff = bytearray(self.buff.readline(length))
newline_b = b'\n'

# we may be at a boundary
while not linebuff.endswith(b'\n'):
while not linebuff.endswith(newline_b):
if length:
length -= len(linebuff)
if length <= 0:
Expand All @@ -203,7 +204,7 @@ def readline(self, length=None):

linebuff += self.buff.readline(length)

return linebuff
return bytes(linebuff)

def empty(self):
if not self.buff or self.buff.tell() >= self.buff_size:
Expand Down Expand Up @@ -336,7 +337,8 @@ def _try_decode(self, length_header):
return

data_len = 0
data = b''
# bytes are immutable, in-place concatenation via bytearray avoids the quadratic runtime cost
data = bytearray()

# read chunk
while data_len < chunk_size:
Expand All @@ -348,7 +350,7 @@ def _try_decode(self, length_header):
if not new_data:
if self.raise_chunked_data_exceptions:
msg = 'Ran out of data before end of chunk'
raise ChunkedDataException(msg, data)
raise ChunkedDataException(msg, bytes(data))
else:
chunk_size = data_len
self.all_chunks_read = True
Expand All @@ -362,10 +364,10 @@ def _try_decode(self, length_header):
clrf = self.stream.read(2)
if clrf != b'\r\n':
raise ChunkedDataException(b"Chunk terminator not found.",
data)
bytes(data))

# hand to base class for further processing
self._process_read(data)
self._process_read(bytes(data))


#=================================================================
Expand Down
10 changes: 6 additions & 4 deletions warcio/capture_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,14 @@ def _extract_url(self, data):
path = line.split(' ', 2)[1]

scheme = 'https' if self.default_port == 443 else 'http'
url = scheme + '://' + self.host
# strings are immutable; in-place concatenation via list avoids the quadratic runtime cost
url = [scheme, '://', self.host]
if self.port != self.default_port:
url += ':' + str(self.port)
url.append(':')
url.append(str(self.port))

url += path
return url
url.append(path)
return ''.join(url)


# ============================================================================
Expand Down
43 changes: 27 additions & 16 deletions warcio/statusandheaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,26 +134,28 @@ def __bool__(self):
__nonzero__ = __bool__

def to_str(self, filter_func=None):
string = self.protocol
# strings are immutable, in-place concatenation via list avoids the quadratic runtime cost
crlf = '\r\n'
string = [self.protocol]

if string and self.statusline:
string += ' '
if self.protocol and self.statusline:
string.append(' ')

if self.statusline:
string += self.statusline
string.append(self.statusline)

if string:
string += '\r\n'
if self.protocol or self.statusline:
string.append(crlf)

for h in self.headers:
if filter_func:
h = filter_func(h)
if not h:
continue
string.append(': '.join(h))
string.append(crlf)

string += ': '.join(h) + '\r\n'

return string
return ''.join(string)

def to_bytes(self, filter_func=None, encoding='utf-8'):
return self.to_str(filter_func).encode(encoding) + b'\r\n'
Expand Down Expand Up @@ -247,6 +249,14 @@ def parse(self, stream, full_statusline=None):
protocol='',
total_len=total_read)

# strings and tuples are immutable, create these objects before the loop
# in order to only create them once per parse invocation
spacestr = ' '
tabstr = '\t'
strip_space_tab = spacestr + tabstr
colonstr = ':'
split_on_space_or_tab = (spacestr, tabstr)

# validate only if verify is set
if self.verify:
protocol_status = self.split_prefix(statusline, self.statuslist)
Expand All @@ -256,14 +266,15 @@ def parse(self, stream, full_statusline=None):
msg = msg.format(self.statuslist, statusline)
raise StatusAndHeadersParserException(msg, full_statusline)
else:
protocol_status = statusline.split(' ', 1)
protocol_status = statusline.split(spacestr, 1)

line, total_read = _strip_count(self.decode_header(stream.readline()), total_read)
while line:
result = line.split(':', 1)
result = line.split(colonstr, 1)
if len(result) == 2:
name = result[0].rstrip(' \t')
value = result[1].lstrip()
name = result[0].rstrip(strip_space_tab)
# strings are immutable; in-place concatenation via list avoids the quadratic runtime cost
value = [result[1].lstrip()]
else:
name = result[0]
value = None
Expand All @@ -272,14 +283,14 @@ def parse(self, stream, full_statusline=None):
total_read)

# append continuation lines, if any
while next_line and next_line.startswith((' ', '\t')):
while next_line and next_line.startswith(split_on_space_or_tab):
if value is not None:
value += next_line
value.append(next_line)
next_line, total_read = _strip_count(self.decode_header(stream.readline()),
total_read)

if value is not None:
header = (name, value)
header = (name, ''.join(value))
headers.append(header)

line = next_line
Expand Down

0 comments on commit 8fb6994

Please sign in to comment.