-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathmem_util.py
190 lines (154 loc) · 6.03 KB
/
mem_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# Utilities to figure out memory usage of run call
#
# Usage:
# import mem_util
# run_metadata = tf.RunMetadata()
# options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
# sess.run(tensor, options=options, run_metadata=run_metadata)
# print(mem_util.peak_memory(run_metadata))
#
# To print memory usage for particular device:
# print(mem_util.peak_memory(run_metadata)["/gpu:0"])
# Developer notes:
# RunMetadata
# https://github.com/tensorflow/tensorflow/blob/fc49f43817e363e50df3ff2fd7a4870ace13ea13/tensorflow/core/protobuf/config.proto#L349
#
# StepStats (run_metadata.step_stats)
# https://github.com/tensorflow/tensorflow/blob/a2d9b3bf5f9e96bf459074d079b01e1c74b25afa/tensorflow/core/framework/step_stats.proto
#
# NodeExecStats (run_metadata.step_stats.dev_stats[0].step_stats[0])
# https://github.com/tensorflow/tensorflow/blob/a2d9b3bf5f9e96bf459074d079b01e1c74b25afa/tensorflow/core/framework/step_stats.proto#L52
# Note, there are several methods of tracking memory allocation. There's
# requested bytes, and allocated bytes. Allocator may choose to give more bytes
# than is requested. Currently allocated_bytes is used to give more realistic
# results
#
# allocation_description {
# requested_bytes: 1000000
# allocated_bytes: 1000192
# allocator_name: "GPU_0_bfc"
#
#
# There's also an additional field in NodeExecStats which tracks allocator state
# node_stats {
# node_name: "Bn_1/value"
# all_start_micros: 1512081861177033
# op_start_rel_micros: 1
# op_end_rel_micros: 3
# all_end_rel_micros: 5
# memory {
# allocator_name: "GPU_0_bfc"
# allocator_bytes_in_use: 3072
# }
#
# Additionally one could use LOG_MEMORY messages to get memory allocation info
# See multiple_memory_obtain_example.py for details on using these additional
# methods
def peak_memory(run_metadata):
  """Return dictionary of peak memory usage (bytes) for each device.

  Args:
    run_metadata: a RunMetadata proto collected with trace_level=FULL_TRACE;
      only its step_stats.dev_stats field is read.

  Returns:
    Dict mapping simplified device name to peak allocated bytes, e.g.
    {"/cpu:0": 20441, ...}
  """
  # NOTE: asserts guard against passing a RunMetadata that was never filled in
  # (e.g. sess.run called without options/run_metadata).
  assert run_metadata is not None
  assert hasattr(run_metadata, "step_stats")
  assert hasattr(run_metadata.step_stats, "dev_stats")
  return {_simplify_device_name(dev_stat.device):
              _peak_from_nodestats(dev_stat.node_stats)
          for dev_stat in run_metadata.step_stats.dev_stats}
def _timeline_from_nodestats(nodestats):
"""Return sorted memory allocation records from list of nodestats
[NodeExecStats, NodeExecStats...], it's the
run_metadata.step_stats.dev_stats[0].step_stats object.
Timeline looks like this:
timestamp nodename mem delta, allocator name
[1509481813012895, 'concat', 1000496, 'cpu'],
[1509481813012961, 'a04', -1000000, 'cpu'],
[1509481813012968, 'TanhGrad', 0, 'cpu'],
0 memory allocation is reported for nodes without allocation_records
"""
lines = []
if not nodestats:
return []
for node in nodestats:
for mem in node.memory: # can have both cpu and gpu allocator for op
try:
records = mem.allocation_records
except:
records = []
allocator = mem.allocator_name
if len(records)>0:
for record in records:
line = [record.alloc_micros, node.node_name, record.alloc_bytes,
allocator]
lines.append(line)
else:
output_bytes = -1
try:
output_bytes = node.output[0].tensor_description.allocation_description.requested_bytes
except:
pass
line = [node.all_start_micros, node.node_name, 0, "unknown"]
lines.append(line)
def first_key(x): return x[0]
return sorted(lines, key=first_key)
# todo: get rid of "timeline_from_nodestats"
def _position_of_largest(my_list):
"""Return index of largest entry """
import operator
index, value = max(enumerate(my_list), key=operator.itemgetter(1))
return index
def _peak_from_nodestats(nodestats):
  """Return peak memory usage (bytes) seen while executing these nodestats.

  Replays the time-sorted allocation/deallocation deltas from
  _timeline_from_nodestats and tracks the running total, returning its
  maximum (never negative, since both counters start at 0).

  Args:
    nodestats: list of NodeExecStats messages (may be empty or None).

  Returns:
    Peak of the running allocated-bytes total as an int.
  """
  total_memory = 0
  peak_memory = 0
  for _timestamp, _name, allocated_bytes, _allocator in (
      _timeline_from_nodestats(nodestats)):
    total_memory += int(allocated_bytes)
    peak_memory = max(total_memory, peak_memory)
  return peak_memory
def _print_parsed_timeline(timeline, gpu_only=False, ignore_less_than_bytes=0):
"""pretty print parsed memory timeline."""
total_memory = 0
timestamps = []
data = []
first_timestamp = timeline[0][0]
for record in timeline:
timestamp, kernel_name, allocated_bytes, allocator_type = record
allocated_bytes = int(allocated_bytes)
if abs(allocated_bytes)<ignore_less_than_bytes:
continue # ignore small allocations
total_memory += allocated_bytes
print("%6d %10d %10d %s"%(timestamp-first_timestamp, total_memory,
allocated_bytes, kernel_name))
def _simplify_device_name(device):
"""/job:localhost/replica:0/task:0/device:CPU:0 -> /cpu:0"""
prefix = '/job:localhost/replica:0/task:0/device:'
if device.startswith(prefix):
device = '/'+device[len(prefix):]
return device.lower()
def _device_stats_dict(run_metadata):
  """Return dict of device_name -> [NodeExecStats, NodeExecStats, ...]."""
  return {_simplify_device_name(dev_stat.device): dev_stat.node_stats
          for dev_stat in run_metadata.step_stats.dev_stats}
def print_memory_timeline(run_metadata, device=None):
  """Print a human readable timeline of memory allocation/deallocation.

  Args:
    run_metadata: RunMetadata proto collected with FULL_TRACE.
    device: device name to print; if None, the device with the highest
      peak memory usage is chosen automatically.
  """
  if device is None:
    # Pick the device whose peak usage is largest.
    peak_pairs = list(peak_memory(run_metadata).items())
    largest = _position_of_largest([peak for (_dev, peak) in peak_pairs])
    device = peak_pairs[largest][0]
  stats_by_device = _device_stats_dict(run_metadata)
  print("Printing timeline for "+device)
  parsed = _timeline_from_nodestats(stats_by_device[device])
  _print_parsed_timeline(parsed)