-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathentropy.py
105 lines (77 loc) · 3.14 KB
/
entropy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import csv
import math
import operator
def main():
    """Interactively build per-field frequency tables from netflow CSV files.

    Prompts for an output CSV path, then for input CSV paths until the user
    types 'done'. For every input row the first four columns are tallied as
    source address, destination address, source port and destination port
    (in that column order), and one calcfreq() table per field is written
    to the output file.
    """
    # Frequency dictionary for each value in the netflow record
    src_address = dict()
    dst_address = dict()
    src_port = dict()
    dst_port = dict()

    # list of paths, as records can be spread across multiple csv files
    paths = []

    print("type 'done' to execute'\n")
    print("Target output csv: ", end="")
    target_path = input()

    # Erase the target file up front; opening in 'w' mode truncates it.
    with open(target_path, 'w'):
        pass

    # ask for multiple csv's, breaks on 'done'
    while True:
        print("Filepath: ", end="")
        filepath = input()
        if filepath == "done":
            break
        paths.append(filepath)

    # Show that this step has been started
    print("working")

    # iterate through each file
    for path in paths:
        # newline='' is what the csv module expects for files it parses.
        with open(path, 'r', newline='') as es_data_file:
            # csv.reader strips the line terminator, so the last column is
            # no longer counted with a trailing "\n" attached to its value.
            for cols in csv.reader(es_data_file):
                # Blank or truncated rows cannot feed all four tallies;
                # skip them instead of failing with IndexError.
                if len(cols) < 4:
                    continue
                # increase the count of cols[x] in the dictionary y.
                dict_add(cols[0], src_address)
                dict_add(cols[1], dst_address)
                dict_add(cols[2], src_port)
                dict_add(cols[3], dst_port)

    # The context manager closes the report even if a table fails to write.
    with open(target_path, 'w') as target:
        calcfreq("Destination Address", dst_address, target)
        calcfreq("Source Address", src_address, target)
        calcfreq("Destination Port", dst_port, target)
        calcfreq("Source Port", src_port, target)

    print("done")
def calcfreq(dict_name, count_dict, file_target, max_rows=500):
    """Write a count / probability / entropy table for one field.

    The output is two blank lines, the title, the total record count, a CSV
    header, and then one CSV row per key, most frequent first.

    Args:
        dict_name: Title written above the table.
        count_dict: Mapping of observed value -> number of occurrences.
        file_target: Writable text stream; only its write() method is used.
        max_rows: Maximum number of data rows to write (non-negative).
            Defaults to 500; pass None to write every entry.
    """
    file_target.write("\n\n" + dict_name + "\n")

    # Count the total number of records
    number_of_records = sum(count_dict.values())
    file_target.write("Total number of records: " + str(number_of_records) + "\n")

    file_target.write("Key,# Records,Probability (count/totalcount),Entropy (-log_2(probability)))\n")

    # Ascending stable sort followed by a reversal: largest counts first,
    # and keys with equal counts appear in reverse insertion order.
    ranked = sorted(count_dict.items(), key=operator.itemgetter(1))[::-1]

    # Slicing caps the output at exactly max_rows rows.
    for key, count in ranked[:max_rows]:
        probability = count / number_of_records
        # log2(p) <= 0 for p in (0, 1], so abs() is the negation; it also
        # yields 0.0 rather than -0.0 when a single key holds every record.
        entropy = abs(math.log2(probability))
        row = [str(key), str(count), str(probability), str(entropy)]
        file_target.write(",".join(row) + "\n")
def dict_add(in_string, in_dictionary):
    """Bump the tally stored under in_string in in_dictionary.

    A key that is not present yet is treated as having a tally of zero,
    so its first occurrence is stored as 1. The dictionary is modified in
    place and nothing is returned.
    """
    previous = in_dictionary.get(in_string, 0)
    in_dictionary[in_string] = previous + 1
# Run the interactive tool only when this file is executed as a script,
# so importing the module does not start prompting for input.
if __name__ == "__main__":
    main()