Skip to content

Commit

Permalink
commit after running pre-commit hook for the first time
Browse files Browse the repository at this point in the history
  • Loading branch information
Shad Humydee committed Apr 6, 2023
1 parent edb5d92 commit 6d7e7e3
Show file tree
Hide file tree
Showing 8 changed files with 259 additions and 86 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Please enter the location of the file:

### The Highest Hits for a Specific Time Interval

This fetches the maximum hit that came to your site within the specific time range. Here I used --time-interval 10; it
This fetches the maximum hit that came to your site within the specific time range. Here I used --time-interval 10; it
means I want to see all the hits that came within that timeframe, grouped into 10-minute intervals. I also used --topk 3,
meaning it will bring me the latest top 3 results that match those criteria.

Expand Down Expand Up @@ -99,5 +99,5 @@ Here are some demos of nginx-log-analyzer in action:
## 🚀 About Me
I'm **Shad** Humydee, a Software Engineer in AI/ML who happens to love the DevOps and architecture side of applications.

I currently work in Infolytx (https://infolytx.ai) as a **Software Engineer II** in AI - ML - Architecture
I currently work in Infolytx (https://infolytx.ai) as a **Software Engineer II** in AI - ML - Architecture

39 changes: 26 additions & 13 deletions app/service/insight_provider_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,49 @@


class InsightProviderService:
    """Derives aggregate insights (currently: the busiest fixed-width
    timeframes) from raw nginx access-log lines."""

    # strftime/strptime pattern used throughout the project for log timestamps.
    _TIMESTAMP_FORMAT = "%d/%b/%Y:%H:%M:%S"

    @staticmethod
    def _make_hit_record(
        window_start, window_end, hit_count: int
    ) -> TimeFrameWiseHitDto:
        """Build a DTO recording that ``hit_count`` hits fell within the
        closed window [window_start, window_end]."""
        timeframe = TimeRangeDto(
            start_time=window_start.strftime(
                InsightProviderService._TIMESTAMP_FORMAT
            ),
            end_time=window_end.strftime(InsightProviderService._TIMESTAMP_FORMAT),
        )
        return TimeFrameWiseHitDto(timeframe=timeframe, hit_count=hit_count)

    def get_timeframes_by_hit_count(
        self, logs: List[str], time_interval_in_minutes: int, top_k: int = 3
    ) -> List[TimeFrameWiseHitDto]:
        """Return the ``top_k`` timeframes that received the most hits.

        The log lines are scanned in order (they are assumed to be
        chronological — TODO confirm against the log source) and bucketed
        into consecutive windows of ``time_interval_in_minutes`` minutes,
        anchored at the first event's timestamp.

        :param logs: raw access-log lines
        :param time_interval_in_minutes: width of each bucket, in minutes
        :param top_k: number of busiest buckets to return (default 3)
        :return: up to ``top_k`` DTOs, ordered by hit count descending
        """
        interval = datetime.timedelta(minutes=time_interval_in_minutes)
        hits_in_window = 0
        window_start = window_end = None
        timeframe_wise_hit: List[TimeFrameWiseHitDto] = []

        for single_line_log in logs:
            # Parse the event timestamp once per line (the original code
            # extracted and parsed it twice for window-opening lines).
            event_timestamp = LogFilterUtil.get_date_time_in_comparable_format(
                InfoExtractionUtil.get_timestamp_from_single_line_text(
                    single_line_log
                )[0]
            )

            if not hits_in_window:
                # The very first event anchors the window grid.
                window_start = event_timestamp
                window_end = window_start + interval

            if window_start <= event_timestamp <= window_end:
                hits_in_window += 1
            else:
                # Close the current window and open the next adjacent one;
                # the out-of-window event seeds the new window's count.
                timeframe_wise_hit.append(
                    self._make_hit_record(window_start, window_end, hits_in_window)
                )
                window_start = window_end
                window_end = window_start + interval
                hits_in_window = 1

        # Bug fix: the original implementation never flushed the final,
        # still-open window, so hits in the last partial interval were
        # silently dropped from the results.
        if hits_in_window:
            timeframe_wise_hit.append(
                self._make_hit_record(window_start, window_end, hits_in_window)
            )

        timeframe_wise_hit.sort(key=lambda frame: frame.hit_count, reverse=True)
        return timeframe_wise_hit[:top_k]
16 changes: 8 additions & 8 deletions app/util/info_extraction_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
class InfoExtractionUtil:
    """Static helpers that extract individual fields (client IP, timestamp,
    HTTP status code) from a single nginx access-log line via regexes."""

    @staticmethod
    def get_ip_from_single_line_text(single_line_text: str) -> List[str]:
        """Return the client IP that starts the log line.

        :param single_line_text: one raw access-log line
        :return: a list containing the leading dotted-quad IP, or [] if the
            line does not begin with one
        """
        regex_pattern_for_ip = re.compile(r"^\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b")
        ip_addresses = regex_pattern_for_ip.findall(single_line_text)

        logger.debug(f"Found IP: {ip_addresses}")

        return ip_addresses

    @staticmethod
    def get_timestamp_from_single_line_text(single_line_text: str) -> List[str]:
        """Return the bracketed request timestamp from the log line.

        :param single_line_text: one raw access-log line
        :return: a list with the "dd/Mon/yyyy:HH:MM:SS" portion of the
            "[dd/Mon/yyyy:HH:MM:SS +zzzz]" field, or [] if absent
        """
        regex_pattern_for_timestamp = re.compile(
            r"\[(\d{2}\/\w{3}\/\d{4}:\d{2}:\d{2}:\d{2})"
        )
        timestamp = regex_pattern_for_timestamp.findall(single_line_text)

        logger.debug(f"Timestamp (UTC): {timestamp}")

        return timestamp

    @staticmethod
    def get_status_code_from_single_line_text(
        single_line_text: str,
    ) -> Union[str, None]:
        """Return the HTTP status code that follows the request field.

        Bug fix: the previous pattern ``HTTP/1.1*. ([0-9][0-9][0-9])`` left
        its dots unescaped and effectively matched only "HTTP/1.1" requests;
        the corrected pattern also matches e.g. "HTTP/1.0" and "HTTP/2.0".

        :param single_line_text: one raw access-log line
        :return: the first 3-digit status code as a string, or None
        """
        regex_pattern_for_status_code = re.compile(r'HTTP/\d(?:\.\d)?" (\d{3})')
        status_code = regex_pattern_for_status_code.findall(single_line_text)

        logger.debug(f"Status Code: {status_code}")

        if status_code:
            return status_code[0]
        return None




1 change: 1 addition & 0 deletions app/util/log_file_handler_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def get_new_file_location() -> str:

return new_file_location


def set_new_file_location(access_log_location: str) -> str:
new_file_location = access_log_location

Expand Down
22 changes: 17 additions & 5 deletions app/util/log_filter_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,33 @@

class LogFilterUtil:
    """Static helpers for filtering and comparing access-log timestamps."""

    @staticmethod
    def get_date_time_in_comparable_format(
        date_time_as_string: str, format: str = "%d/%b/%Y:%H:%M:%S"
    ) -> datetime:
        """Parse a log timestamp string into a comparable ``datetime``.

        :param date_time_as_string: timestamp text, e.g. "06/Apr/2023:10:15:30"
        :param format: ``strptime`` pattern; the name shadows the ``format``
            builtin but is kept for backward compatibility with keyword callers
        :return: naive ``datetime`` suitable for chronological comparison
        """
        return datetime.strptime(date_time_as_string, format)

@staticmethod
def get_logs_within_timeframe(logs_to_filter: List[str], time_range: TimeRangeDto):
logs_within_timeframe = []

start_time = LogFilterUtil.get_date_time_in_comparable_format(time_range.start_time)
start_time = LogFilterUtil.get_date_time_in_comparable_format(
time_range.start_time
)
end_time = LogFilterUtil.get_date_time_in_comparable_format(time_range.end_time)

for single_line_log in logs_to_filter:
time_stamp = InfoExtractionUtil.get_timestamp_from_single_line_text(single_line_log)
time_stamp = InfoExtractionUtil.get_timestamp_from_single_line_text(
single_line_log
)
if time_stamp:
time_stamp_as_string = InfoExtractionUtil.get_timestamp_from_single_line_text(single_line_log)[0]
time_stamp = LogFilterUtil.get_date_time_in_comparable_format(time_stamp_as_string)
time_stamp_as_string = (
InfoExtractionUtil.get_timestamp_from_single_line_text(
single_line_log
)[0]
)
time_stamp = LogFilterUtil.get_date_time_in_comparable_format(
time_stamp_as_string
)

if start_time <= time_stamp <= end_time:
logs_within_timeframe.append(single_line_log)
Expand Down
128 changes: 98 additions & 30 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ def store_hit_record(ip_wise_hits: Dict, ip_address: str, status_code: int) -> D
return ip_wise_hits


def get_filtered_ip_wise_hits(ip_wise_hits: Dict[str, Dict[int, int]], min_hit_count) -> Dict[str, Dict[int, int]]:
def get_filtered_ip_wise_hits(
ip_wise_hits: Dict[str, Dict[int, int]], min_hit_count
) -> Dict[str, Dict[int, int]]:
filtered_ip_wise_hits = {}

for ip_address, status_code_wise_hit_count in ip_wise_hits.items():
Expand All @@ -43,29 +45,65 @@ def get_filtered_ip_wise_hits(ip_wise_hits: Dict[str, Dict[int, int]], min_hit_c


if __name__ == "__main__":

parser = argparse.ArgumentParser(description="Nginx Access Log Analytics")

min_start_time = (datetime.min + timedelta(days=365 * 2000)).strftime('%d/%b/%Y:%H:%M:%S')
max_end_time = datetime.max.strftime('%d/%b/%Y:%H:%M:%S')

parser.add_argument("--start-time", nargs='?', type=str, default=min_start_time,
help="--start-time 15/Mar/2023:12:00:00")
parser.add_argument("--end-time", nargs='?', type=str, default=max_end_time,
help="--end-time 15/Mar/2023:17:35:00")
parser.add_argument("--min-hit-count", nargs='?', type=int, default=1, help="--min-hit-count 5")
parser.add_argument("--status-code-wise", nargs='?', type=bool, default=False, help="--status-code-wise True")
parser.add_argument("--timestamp", nargs='?', type=bool, default=False, help="--timestamp True")
parser.add_argument("--time-interval", nargs='?', type=int, default=False,
help="Get maximum hits in a time interval in minutes. Example: --time-interval 10")
parser.add_argument("--topk", nargs='?', type=int, default=False,
help="Top k timeframes when the hit was maximum Example: --topk 3")
parser.add_argument("--file-location", nargs='?', type=str, default=False,
help="Set location of the Access Log File: Example: --file Desktop/access.log")
min_start_time = (datetime.min + timedelta(days=365 * 2000)).strftime(
"%d/%b/%Y:%H:%M:%S"
)
max_end_time = datetime.max.strftime("%d/%b/%Y:%H:%M:%S")

parser.add_argument(
"--start-time",
nargs="?",
type=str,
default=min_start_time,
help="--start-time 15/Mar/2023:12:00:00",
)
parser.add_argument(
"--end-time",
nargs="?",
type=str,
default=max_end_time,
help="--end-time 15/Mar/2023:17:35:00",
)
parser.add_argument(
"--min-hit-count", nargs="?", type=int, default=1, help="--min-hit-count 5"
)
parser.add_argument(
"--status-code-wise",
nargs="?",
type=bool,
default=False,
help="--status-code-wise True",
)
parser.add_argument(
"--timestamp", nargs="?", type=bool, default=False, help="--timestamp True"
)
parser.add_argument(
"--time-interval",
nargs="?",
type=int,
default=False,
help="Get maximum hits in a time interval in minutes. Example: --time-interval 10",
)
parser.add_argument(
"--topk",
nargs="?",
type=int,
default=False,
help="Top k timeframes when the hit was maximum Example: --topk 3",
)
parser.add_argument(
"--file-location",
nargs="?",
type=str,
default=False,
help="Set location of the Access Log File: Example: --file Desktop/access.log",
)
args = parser.parse_args()

if args.start_time == min_start_time and args.end_time == max_end_time:
logger.info(f"Fetching Entire Log")
logger.info("Fetching Entire Log")
elif args.start_time == min_start_time:
logger.info(f"Fetching Logs from the Beginning to {args.end_time}")
elif args.end_time == max_end_time:
Expand All @@ -78,30 +116,43 @@ def get_filtered_ip_wise_hits(ip_wise_hits: Dict[str, Dict[int, int]], min_hit_c

log_file_location = get_file_location()

with open(log_file_location, 'r') as file:
with open(log_file_location, "r") as file:
log_file_contents = file.read().split("\n")

filtered_log_file_contents = LogFilterUtil.get_logs_within_timeframe(log_file_contents, args)
filtered_log_file_contents = LogFilterUtil.get_logs_within_timeframe(
log_file_contents, args
)

ip_wise_hits = {}
ip_wise_access_timestamp = {}
total_hit_count = len(filtered_log_file_contents)

for single_line_log in filtered_log_file_contents:
extracted_ip = InfoExtractionUtil.get_ip_from_single_line_text(single_line_log)[0]
status_code = InfoExtractionUtil.get_status_code_from_single_line_text(single_line_log)
extracted_ip = InfoExtractionUtil.get_ip_from_single_line_text(single_line_log)[
0
]
status_code = InfoExtractionUtil.get_status_code_from_single_line_text(
single_line_log
)

ip_wise_hits = store_hit_record(ip_wise_hits, extracted_ip, status_code)

event_timestamp = InfoExtractionUtil.get_timestamp_from_single_line_text(single_line_log)[0]
event_timestamp = InfoExtractionUtil.get_timestamp_from_single_line_text(
single_line_log
)[0]

if extracted_ip not in ip_wise_access_timestamp:
ip_wise_access_timestamp[extracted_ip] = [event_timestamp]
else:
ip_wise_access_timestamp[extracted_ip].append(event_timestamp)

ip_wise_hits = {k: dict(sorted(v.items(), key=lambda x: x[1], reverse=True)) for k, v in ip_wise_hits.items()}
ip_wise_hits = dict(sorted(ip_wise_hits.items(), key=lambda x: sum(x[1].values()), reverse=True))
ip_wise_hits = {
k: dict(sorted(v.items(), key=lambda x: x[1], reverse=True))
for k, v in ip_wise_hits.items()
}
ip_wise_hits = dict(
sorted(ip_wise_hits.items(), key=lambda x: sum(x[1].values()), reverse=True)
)

filtered_ip_wise_hits = get_filtered_ip_wise_hits(ip_wise_hits, args.min_hit_count)

Expand All @@ -116,7 +167,13 @@ def get_filtered_ip_wise_hits(ip_wise_hits: Dict[str, Dict[int, int]], min_hit_c

for status_code, hit_count in status_code_wise_hit_count.items():
if args.status_code_wise:
print("Status Code " + "\033[92m" + f"{status_code}: " + "\033[0m" + f"{hit_count} hits")
print(
"Status Code "
+ "\033[92m"
+ f"{status_code}: "
+ "\033[0m"
+ f"{hit_count} hits"
)
total_hit_for_ip += hit_count
print("Total Hit: " + "\033[92m" + f"{total_hit_for_ip}" + "\033[0m")
if args.timestamp:
Expand All @@ -126,7 +183,18 @@ def get_filtered_ip_wise_hits(ip_wise_hits: Dict[str, Dict[int, int]], min_hit_c
print(f"Total Hits from All IP: {total_hit_count} Hits.")

if args.time_interval:
topk_timeframes_by_git_count = insight_provider_service.get_timeframes_by_hit_count(filtered_log_file_contents, args.time_interval, args.topk)
topk_timeframes_by_git_count = (
insight_provider_service.get_timeframes_by_hit_count(
filtered_log_file_contents, args.time_interval, args.topk
)
)
for topk_results in topk_timeframes_by_git_count:
print("Timeframe: " + "\033[33m" + f"{topk_results.timeframe}" + "\033[0m" + "\033[36m" + f" - Hit Count: {topk_results.hit_count}" + "\033[0m")

print(
"Timeframe: "
+ "\033[33m"
+ f"{topk_results.timeframe}"
+ "\033[0m"
+ "\033[36m"
+ f" - Hit Count: {topk_results.hit_count}"
+ "\033[0m"
)
Loading

0 comments on commit 6d7e7e3

Please sign in to comment.