-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathheader_analysis.py
executable file
·153 lines (119 loc) · 4.39 KB
/
header_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
#
# header_analysis.py: performs an analysis of STL headers in C++
# software projects.
#
# Usage : ./header_analysis.py DIR
# Output: SVG files of header counts (histogram and matrix)
import os
import re
import sys
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
# All C++ standard library headers defined according to the C++17
# standard. The C compatibility headers (<cstdio>, <cmath>, ...) are
# not part of this list. This should cover most codebases.
system_headers = {
    "algorithm", "any", "array", "atomic",
    "bitset",
    "charconv", "chrono", "codecvt", "complex", "condition_variable",
    "deque",
    "exception", "execution",
    "filesystem", "forward_list", "fstream", "functional", "future",
    "initializer_list", "iomanip", "ios", "iosfwd", "iostream", "istream", "iterator",
    "limits", "list", "locale",
    "map", "memory", "memory_resource", "mutex",
    "new", "numeric",
    "optional", "ostream",
    "queue",
    "random", "ratio", "regex",
    "scoped_allocator", "set", "shared_mutex", "sstream", "stack", "stdexcept", "streambuf", "string", "string_view", "strstream", "system_error",
    "thread", "tuple", "type_traits", "typeindex", "typeinfo",
    "unordered_map", "unordered_set", "utility",
    "valarray", "variant", "vector",
}
def process_file(filename):
    """
    Extracts the includes from a matching file and returns them as a set
    in order to prevent duplicates.

    Both the `#include <name>` and the `#include "name"` form are
    recognised. Whitespace between `#` and `include`, as well as between
    `include` and the opening delimiter, is optional, so that lines such
    as `#include<vector>` or `#  include <map>` are picked up as well.

    :param filename: Path of the file to scan
    :return: Set of included names, stripped of surrounding whitespace
    """

    # Compiled once per file instead of being looked up for every line.
    # Only the beginning of a line has to match, so anything following
    # the closing delimiter (such as a trailing comment) is ignored.
    re_include = re.compile(r'\s*#\s*include\s*[<"]([^>"]+)[>"]')

    headers = set()

    # ISO 8859-1 assigns a character to every possible byte value, so
    # reading never fails with a decoding error, regardless of the
    # actual encoding of the file.
    with open(filename, encoding='iso8859-1') as f:
        for line in f:
            matches = re_include.match(line)
            if matches:
                headers.add(matches.group(1).strip())

    return headers
def process_headers(headers):
    """
    Filters a collection of header names such that only system headers
    remain; all headers that have been defined by users are dropped.
    The result is a list that follows the iteration order of the input.
    """

    known = []

    for candidate in headers:
        if candidate in system_headers:
            known.append(candidate)

    return known
# List of file extensions that may potentially contain C++ code. This
# should cover most conventional codebases.
extensions = [ ".cc", ".C", ".cxx", ".cpp", ".h", ".hh", ".hxx", ".hpp" ]

# Report a usage message instead of an `IndexError` traceback if the
# directory argument is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: ./header_analysis.py DIR")

root = sys.argv[1]

# `os.walk` silently yields nothing for a path that is not a directory,
# so this case is reported explicitly.
if not os.path.isdir(root):
    sys.exit("Error: '" + root + "' is not a directory")

# Number of files in which a system header is included
header_counts = Counter()

# Number of files in which two system headers are included together; the
# keys are tuples `(header1, header2)` with `header1 < header2`.
header_cooccurrences = Counter()

for path, directories, files in os.walk(root):
    for name in files:
        filename = os.path.join(path, name)
        extension = os.path.splitext(filename)[1]

        if extension not in extensions:
            continue

        # Sorting the unique headers makes it possible to enumerate
        # every unordered pair exactly once, with the smaller name in
        # front, without visiting (and discarding) the mirrored pair.
        headers = sorted(set(process_headers(process_file(filename))))

        for index, header1 in enumerate(headers):
            for header2 in headers[index + 1:]:
                header_cooccurrences[ (header1, header2) ] += 1

        header_counts.update(headers)

# Without any system header there is nothing to report or to plot.
if not header_counts:
    sys.exit("No system headers found in '" + root + "'")
total = sum(header_counts.values())

# Prepare labels and relative counts from a single ranking of the
# headers; this ensures that both lists are sorted according to the
# counts, in descending order.
ranking = header_counts.most_common()
labels = [header for header, _ in ranking]
counts = [count / total for _, count in ranking]

print("Dominant headers (accounting for 50% of all usages):")

# Walk down the ranking until the reported headers make up at least half
# of all usages.
cumulative = 0.0
for label, fraction in zip(labels, counts):
    print(" -", label)
    cumulative += fraction
    if cumulative >= 0.50:
        break
# Settings shared by all plots
plt.rcParams.update({
    "svg.fonttype": "path",
    "font.sans-serif": "Myriad Pro",
    "font.size": 8,
})

########################################################################
# Plot 1: Individual header counts
########################################################################

# One bar per header, in the order given by the labels
positions = range(len(labels))

plt.bar(positions, counts, align="center")
plt.xticks(positions, labels, rotation="vertical")
plt.savefig("Header_histogram.svg")
########################################################################
# Plot 2: Co-occurrences
########################################################################

# Position of every header along both axes of the matrix; this follows
# the order of the labels.
header_to_index = { header: index for index, header in enumerate(labels) }

size = len(labels)
cooccurrence_matrix = np.zeros((size, size))

# Every pair is stored only once, so both the entry and its mirrored
# counterpart are filled in order to obtain a symmetric matrix.
for (header1, header2), count in header_cooccurrences.items():
    u, v = header_to_index[header1], header_to_index[header2]
    cooccurrence_matrix[u,v] = cooccurrence_matrix[v,u] = count

# Logarithmic scaling of the counts; adding one keeps empty cells at
# zero.
plt.matshow(np.log(cooccurrence_matrix + 1))

ticks = range(size)
plt.xticks(ticks, labels, rotation="vertical")
plt.yticks(ticks, labels)

ax = plt.gca()

# TODO: fix text in cells...
#for (i,j), z in np.ndenumerate(cooccurrence_matrix):
#    ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center')

plt.savefig("Header_coocurrences.svg")