-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlog_processing.py
executable file
·135 lines (108 loc) · 4.6 KB
/
log_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
#
# Process log files from the BES. jhrg 12/06/24
import csv
import argparse
import os
from datetime import datetime
from collections import defaultdict
# Module-wide verbosity flag; main() sets it from the -v/--verbose option and
# transform_logs_to_csv() reads it to decide whether to print line tallies.
verbose = False
def split_csv_by_pid(input_file, field=2):
    """
    Split the given CSV file into N files, one for each distinct PID value.

    Rows are grouped by the value found in column ``field``. Each group is
    written, in input order, to ``<input base name>_pid_<PID>.csv`` in the
    current working directory. Empty rows are ignored. Rows too short to have
    a ``field`` column are skipped and reported instead of aborting the split.

    :param input_file: CSV file of BES log data
    :param field: zero-based index of the field that holds the PID (this changed
        from 1 to 2 when instance IDs were added to the bes.log)
    :return: None; makes N files as a side effect
    """
    # Rows grouped by process ID, in the order the PIDs first appear.
    pid_groups = defaultdict(list)
    skipped = 0

    # newline='' lets the csv module handle embedded newlines in quoted fields.
    with open(input_file, mode='r', newline='') as infile:
        for row in csv.reader(infile):
            if not row:  # Skip empty lines
                continue
            if len(row) <= field:
                # Malformed row: there is no PID column to group by.
                skipped += 1
                continue
            pid_groups[row[field]].append(row)

    if skipped:
        print(f"Skipped {skipped} row(s) with no PID in field {field}")

    # Write one output file per unique PID
    base_filename = os.path.splitext(os.path.basename(input_file))[0]
    for pid, rows in pid_groups.items():
        output_filename = f"{base_filename}_pid_{pid}.csv"
        with open(output_filename, mode='w', newline='') as outfile:
            csv.writer(outfile).writerows(rows)
        print(f"Written {output_filename}")
def transform_logs_to_csv(input_file, output_file, type_field=2):
    """
    Transform raw BES log lines into CSV rows.

    Each non-blank input line is split on the '|&|' separator and written as
    one CSV row. The first field is converted from Unix seconds to an ISO 8601
    UTC string ending in 'Z'; if it cannot be converted it is written
    unchanged. All other fields (including microsecond values in 'timing'
    lines) are copied through as-is. Blank lines are skipped.

    Lines are tallied by the record type found in ``type_field`` ('info',
    'timing', 'request' or 'error'). Anything else, including lines too short
    to have that field, is counted as unknown and still written. A message is
    printed when unknown lines were seen, and the tallies are printed when the
    module-level ``verbose`` flag is set.

    Parameters:
    - input_file: Path to the input file containing log lines.
    - output_file: Path to the output CSV file (overwritten if it exists).
    - type_field: Zero-based index of the field holding the record type.
    """
    # NOTE(review): split_csv_by_pid() defaults the PID to index 2 of the same
    # rows, so one of the two defaults may be stale -- confirm against the
    # current bes.log layout and pass type_field explicitly if needed.

    # Local import: the file only imports the datetime class at module level.
    from datetime import timezone

    def convert_to_iso(timestamp):
        """Converts Unix time in seconds to ISO 8601 format."""
        moment = datetime.fromtimestamp(int(timestamp), tz=timezone.utc)
        # Drop tzinfo so isoformat() omits '+00:00'; the 'Z' suffix marks UTC.
        return moment.replace(tzinfo=None).isoformat() + "Z"

    known_counts = {"info": 0, "timing": 0, "request": 0, "error": 0}
    line_count = 0
    unknown_count = 0

    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        # Stream the log line by line rather than loading it all into memory.
        for raw_line in infile:
            stripped = raw_line.strip()
            if not stripped:
                continue  # blank lines carry no record

            line_count += 1
            # Split the log line by '|&|'
            fields = stripped.split('|&|')

            record_type = fields[type_field] if len(fields) > type_field else None
            if record_type in known_counts:
                known_counts[record_type] += 1
            else:
                unknown_count += 1

            # Convert the first field (assumed to be Unix time) to ISO 8601
            try:
                fields[0] = convert_to_iso(fields[0])
            except (ValueError, OverflowError, OSError):
                pass  # Not a usable timestamp; keep the original value

            # Write the transformed line to the CSV file
            writer.writerow(fields)

    # sanity check; all lines accounted for?
    if line_count != sum(known_counts.values()):
        print("Error in the log file - some lines were not classified.")

    if verbose:
        print(f"Total line: {line_count}")
        print(f"Info: {known_counts['info']}")
        print(f"Timing: {known_counts['timing']}")
        print(f"Request: {known_counts['request']}")
        print(f"Error: {known_counts['error']}")
        print(f"Unknown: {unknown_count}")
def main():
    """
    Command-line entry point: turn a raw BES log into CSV and/or split a CSV
    file into one file per PID.

    Behavior by input and options:
    - raw log, no --split: write ``<input base>.csv`` next to the input.
    - raw log, --split: write the CSV, then split it by PID.
    - ``.csv`` input, --split: split the existing CSV by PID.
    - ``.csv`` input, no --split: rejected with a usage error, because the
      transform would write its output over the very file it is reading.
    """
    parser = argparse.ArgumentParser(description="Process raw BES log files, turning them into CSV files.")
    parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true")
    parser.add_argument("-s", "--split", help="split the csv file into N files, one for each PID",
                        action="store_true")
    parser.add_argument("-i", "--input", help="The log file (raw or CSV) to process.", required=True)

    args = parser.parse_args()

    global verbose
    verbose = args.verbose

    # This kludge means this can be called with a file that is already in CSV
    # form and have that file split up OR it can be called with a 'raw' log and
    # have that turned into CSV and, maybe, split.
    input_base, input_ext = os.path.splitext(args.input)
    if input_ext == ".csv":  # hack; look for csv file
        if not args.split:
            # The derived output name would equal the input name; opening it
            # for writing would truncate the input before it is read.
            parser.error(f"{args.input} is already a CSV file; use --split to split it by PID")
        split_csv_by_pid(args.input)
        return

    input_csv = f"{input_base}.csv"
    transform_logs_to_csv(args.input, input_csv)
    if args.split:
        split_csv_by_pid(input_csv)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()