forked from EleutherAI/the-pile
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
78 lines (58 loc) · 1.79 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import os
import hashlib
from concurrent_iterator.thread import Producer
from functools import reduce
import operator
class ExitCodeError(Exception):
    """Signals that a shell command finished with a non-zero status."""
def id(x):
    """Identity function: hand back the argument unchanged.

    NOTE: this intentionally keeps its original name, which shadows the
    builtin ``id`` inside this module.
    """
    return x
def utf8len(s):
    """Return how many bytes ``s`` occupies once encoded as UTF-8."""
    encoded = s.encode('utf-8')
    return len(encoded)
def sh(x):
    """Run the command line ``x`` through ``os.system`` and fail loudly.

    Args:
        x: Command string handed verbatim to the system shell. It is not
            escaped here, so it must not be built from untrusted input.

    Raises:
        ExitCodeError: If ``os.system`` reports a non-zero status. The
            message includes the raw status value and the command so the
            failure can be diagnosed from the traceback alone.
    """
    status = os.system(x)
    if status:
        raise ExitCodeError(
            'command failed with status {0}: {1}'.format(status, x))
def fwrite(fname, content):
    """Write the text ``content`` to ``fname``, truncating any existing file."""
    with open(fname, mode='w') as sink:
        sink.write(content)
def fread(fname):
    """Read the file at ``fname`` in text mode and return its whole content."""
    with open(fname, 'r') as source:
        text = source.read()
    return text
def ls(x):
    """List directory ``x``, returning each entry prefixed with ``x + '/'``.

    Entries come back in whatever order ``os.listdir`` yields them.
    """
    paths = []
    for entry in os.listdir(x):
        paths.append(x + '/' + entry)
    return paths
def cycle_documents(dataset):
    """Yield the documents of ``dataset`` over and over, without end.

    Every pass calls ``dataset.documents()`` afresh, drops falsy items
    (``id`` here is this module's identity helper, not the builtin) and
    streams the remainder through a ``Producer``.
    """
    while True:
        non_empty = filter(id, dataset.documents())
        # NOTE(review): Producer is from concurrent_iterator.thread; presumably
        # 1000 bounds how many items it buffers ahead -- confirm in that library.
        prefetched = Producer(non_empty, 1000)
        yield from prefetched
def concat(xs):
    """Lazily chain the iterables in ``xs`` into one flat stream of items."""
    for inner in xs:
        yield from inner
def flatMap(f, x):
    """Apply ``f`` to every item of ``x`` and flatten the results one level.

    Args:
        f: Callable that returns an iterable (typically a list) per item.
        x: Iterable of inputs.

    Returns:
        A new list holding the elements of each ``f(item)`` in order.
    """
    # Extending a single accumulator is linear in the total output size;
    # folding with ``operator.add`` re-copied the growing list on every step.
    flattened = []
    for chunk in map(f, x):
        flattened.extend(chunk)
    return flattened
def sha256sum(filename, expected=None):
    """Hash a file with SHA-256, then verify or print the hex digest.

    The file is opened unbuffered and read into one reusable 128 KiB
    buffer, so its size does not affect memory use.

    Args:
        filename: Path of the file to hash.
        expected: Optional hex digest. When truthy, the computed digest
            must equal it exactly; otherwise the digest is just printed.

    Raises:
        AssertionError: If ``expected`` is given and does not match the
            computed digest.
    """
    h = hashlib.sha256()
    buf = bytearray(128 * 1024)
    view = memoryview(buf)
    with open(filename, 'rb', buffering=0) as f:
        # readinto() returns 0 at end of file, the sentinel that ends iter().
        for n in iter(lambda: f.readinto(view), 0):
            h.update(view[:n])

    digest = h.hexdigest()
    if expected:
        # Raise explicitly instead of using ``assert``: assert statements are
        # removed under ``python -O``, which would silently skip the check.
        if digest != expected:
            raise AssertionError(
                'checksum mismatch for {0}: expected {1}, got {2}'.format(
                    filename, expected, digest))
        print('CHECKSUM OK', filename)
    else:
        print(filename, digest)
# https://stackoverflow.com/questions/12523586/python-format-size-application-converting-b-to-kb-mb-gb-tb/37423778
def humanbytes(B):
    """Return a byte count as a human friendly string.

    Values below 1024 are reported in bytes ('Byte' for exactly 1,
    'Bytes' otherwise); larger values are scaled by powers of 1024 and
    shown with two decimals as KiB, MiB, GiB or TiB.

    Args:
        B: Number of bytes; anything accepted by ``float()``.

    Returns:
        The formatted string, e.g. ``'1.50 KiB'``.
    """
    B = float(B)
    KB = float(1024)
    MB = float(KB ** 2)  # 1,048,576
    GB = float(KB ** 3)  # 1,073,741,824
    TB = float(KB ** 4)  # 1,099,511,627,776

    if B < KB:
        # The previous test ``0 == B > 1`` is a chained comparison that can
        # never be true, so the plural form was never selected.
        return '{0} {1}'.format(B, 'Byte' if B == 1 else 'Bytes')
    elif KB <= B < MB:
        return '{0:.2f} KiB'.format(B / KB)
    elif MB <= B < GB:
        return '{0:.2f} MiB'.format(B / MB)
    elif GB <= B < TB:
        return '{0:.2f} GiB'.format(B / GB)
    elif TB <= B:
        return '{0:.2f} TiB'.format(B / TB)