-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmlp_parser.py
132 lines (100 loc) · 3.3 KB
/
mlp_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
'''
Parses a text MLPerf log into a structured format.
'''
from __future__ import print_function
import collections
import json
import re
import sys
# For example,
# :::MLPv0.5.0 ncf 1538678136 (./convert.py:119) preproc_hp_num_eval: 999
# :::MLPv0.5.0 ncf 1538678136 (./convert.py:119) run_start
LogLine = collections.namedtuple('LogLine', [
'full_string', # the complete line as a string
'token', # the token, i.e. ':::MLP'
'version_str', # the version string, e.g. 'v0.5.0'
'benchmark', # the benchmark, e.g. 'ncf'
'timestamp', # seconds as a float, e.g. 1234.567
'filename', # the which generated the log line, e.g. './convert.py'
'lineno', # the line in the file which generated the log line, e.g. 119
'tag', # the string tag
'value', # the parsed value associated with the tag, or None if no value
])
TOKEN = ':::MLP'
# ^.*
LINE_PATTERN = '''
^
(:::MLP)(v[\d]+\.[\d+]\.[\d+]) [ ] # token and version
([a-z]+) [ ] # benchmark
([\d\.]+) [ ] # timestamp
\(([^: ]+):(\d+)\) [ ] # file and lineno
([A-Za-z0-9_]+) [ ]? # tag
(:\s+(.+))? # optional value
$
'''
LINE_REGEX = re.compile(LINE_PATTERN, re.X)
def string_to_logline(string):
''' Returns a LogLine or raises a ValueError '''
m = LINE_REGEX.match(string)
if m is None:
raise ValueError('does not match regex')
args = []
args.append('') # full string
args.append(m.group(1)) # token
args.append(m.group(2)) # version
args.append(m.group(3)) # benchmark
ts = float(m.group(4)) # may raise error, e.g. "1.2.3"
# TODO check for weird values
args.append(ts)
args.append(m.group(5)) # file name
lineno = int(m.group(6)) # may raise error
# TODO check for weird values
args.append(lineno)
args.append(m.group(7)) # tag
if m.group(9) is not None:
j = json.loads(m.group(9))
args.append(j)
else:
# No Value
args.append(None)
return LogLine(*args)
def parse_file(filename):
''' Reads a file by name and returns list of loglines and list of errors'''
with open(filename) as f:
return parse_generator(f)
def strip_and_dedup(gen):
lines = []
for l in gen:
if ':::MLP' not in l:
continue
lines.append(re.sub(".*:::MLP", ":::MLP", l))
return list(sorted(list(set(lines))))
def parse_generator(gen):
''' Reads a generator of lines and returns (loglines, errors)
The list of errors are any parsing issues as a tuple (str_line, error_msg)
'''
loglines = []
failed = []
for line in strip_and_dedup(gen):
line = line.strip()
if TOKEN not in line:
continue
try:
ll = string_to_logline(line)
loglines.append(ll)
except ValueError as e:
failed.append((line, str(e)))
return loglines, failed
if __name__ == '__main__':
if len(sys.argv) != 2:
print('usage: mlp_parser.py FILENAME')
print(' tests parsing on the file.')
sys.exit(1)
filename = sys.argv[1]
lines, errors = parse_file(filename)
print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))
if len(errors) > 0:
print('Lines which failed to parse:')
for line, error in errors:
print(' Following line failed: {}'.format(error))
print(line)