forked from googleprojectzero/functionsimsearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctionsimhash.hpp
132 lines (109 loc) · 5.42 KB
/
functionsimhash.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUNCTIONSIMHASH_HPP
#define FUNCTIONSIMHASH_HPP
#include <cstdint>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <tuple>
#include <vector>
#include "CodeObject.h"
#include "flowgraph.hpp"
#include "flowgraphutil.hpp"
#include "util.hpp"
// A 3-gram of instruction mnemonics, used as one feature type for SimHash.
// (Modern alias declaration instead of typedef; the file already uses
// C++11 features such as constexpr and std::mutex.)
using MnemTuple = std::tuple<std::string, std::string, std::string>;
// A class to perform per-function SimHash calculation.
//
// SimHash was introduced in the paper "Similarity Estimation Techniques from
// Rounding Algorithms" (Moses S. Charikar).
//
// The basic idea proceeds as follows:
// 1) Cut your input into pieces, possibly of heterogeneous type.
// 2) Keep a vector of N floats, initialized to all zeroes initially.
// 3) For each piece, obtain a weight.
// 4) Hash the piece into N bits; for each 1-bit, add the weight to the float
// at the right index, for each 0-bit, subtract the weight.
// 5) Convert the vector of floats to a vector of zeroes and ones again.
//
// The weights themselves are given in a flat text file. The keys in this
// map are the hash values of the feature using the first hash function of the
// hash family (subsequent hash functions are used for the calculations).
class FunctionSimHasher {
public:
  // Construct a hasher whose per-feature weights are loaded from a flat
  // text file mapping uint64_t feature IDs to float weights. Features that
  // have no entry in the file fall back to the default weights below.
  //
  // Arguments:
  //   weight_file - path to the flat text file of feature-ID -> weight.
  //   verbose - if true, enables diagnostic output during hashing.
  //   default_mnemonic_weight - fallback weight for mnemonic-3-gram
  //     features absent from the weight file.
  //   default_graphlet_weight - fallback weight for graphlet (subgraph)
  //     features absent from the weight file.
  //
  // NOTE(review): parameter previously spelled "default_mnemomic_weight";
  // renamed to match the member default_mnemonic_weight_. Declaration-only
  // parameter names do not affect callers or the definition.
  FunctionSimHasher(const std::string& weight_file, bool verbose = false,
    double default_mnemonic_weight = 0.05, double default_graphlet_weight = 1.0);
  // Construct a hasher from an already-populated weight map.
  // NOTE(review): takes a raw pointer; the member weights_ is held by value,
  // so the map is presumably copied — confirm ownership in the .cpp.
  FunctionSimHasher(std::map<uint64_t, float>* weights);
  // Calculate a SimHash value for a given function. Outputs a vector of
  // 64-bit values; number_of_outputs describes how many bits of SimHash
  // should be calculated. Reasonable use is usually 128 (two outputs).
  void CalculateFunctionSimHash(
    Dyninst::ParseAPI::Function* function, uint64_t number_of_outputs,
    std::vector<uint64_t>* output_simhash_values);
  // Calculate a SimHash from an already-extracted feature list instead of
  // walking a Dyninst function.
  void CalculateFunctionSimHash(
    std::vector<FeatureHash>* features, std::vector<uint64_t>* output);
  // Collapse a vector of accumulated floats to one 64-bit SimHash word:
  // positive float -> 1 bit, otherwise 0 (step 5 of the algorithm above).
  static uint64_t FloatsToBits(const std::vector<float>& floats);
  // As above, but converts the whole float vector into as many 64-bit
  // words as needed, appended to *outputs.
  static bool FloatsToBits(const std::vector<float>& floats,
    std::vector<uint64_t>* outputs);
private:
  // Process one subgraph (graphlet rooted at `node`) and hash it into the
  // output float vector.
  void ProcessSubgraph(std::unique_ptr<Flowgraph>& graph, address node,
    uint64_t bits, uint64_t simhash_index,
    std::vector<float>* output_simhash_floats) const;
  // Process one mnemonic n-gram and hash it into the output float vector.
  void ProcessMnemTuple(const MnemTuple &tup, uint64_t bits,
    uint64_t hash_index, std::vector<float>* output_simhash_floats) const;
  // Given an n-bit hash and a weight, add the weight into the output vector
  // with positive sign for 1-bits and negative sign for 0-bits (step 4 of
  // the algorithm above).
  void AddWeightsInHashToOutput(const std::vector<uint64_t>& hash, uint64_t bits,
    float weight, std::vector<float>* output_simhash_floats) const;
  // Helper function to obtain seed values for a given hash-family index.
  inline uint64_t SeedXForHashY(uint64_t seed_index, uint64_t hash_index) const;
  // Hash a tuple of mnemonics into a 64-bit value, using a hash family index.
  uint64_t HashMnemTuple(const MnemTuple& tup, uint64_t hash_index) const;
  // Hash a graph into a 64-bit value, using a hash family index.
  uint64_t HashGraph(std::unique_ptr<Flowgraph>& graph, address node,
    uint64_t hash_index, uint64_t counter) const;
  // Extend a 64-bit graph hash family to an N-bit hash family by just
  // increasing the hash function index.
  void CalculateNBitGraphHash(std::unique_ptr<Flowgraph>& graph, address node,
    uint64_t bits, uint64_t hash_index, std::vector<uint64_t>* output) const;
  // Extend a 64-bit mnemonic tuple hash family to an N-bit hash family by
  // just increasing the hash function index.
  void CalculateNBitMnemTupleHash(const MnemTuple& tup, uint64_t bits,
    uint64_t hash_index, std::vector<uint64_t>* output) const;
  // Return the weight stored for `key` in weights_, or `standard` if absent.
  float GetWeight(uint64_t key, float standard) const;
  // Weight lookup for a graphlet feature (falls back to the graphlet default).
  float GetGraphletWeight(std::unique_ptr<Flowgraph>& graph, address node) const;
  // Weight lookup for a mnemonic-tuple feature (falls back to the mnemonic
  // default).
  float GetMnemTupleWeight(const MnemTuple& tup) const;
  // Read bit `bitindex` out of a multi-word hash value.
  inline bool GetNthBit(const std::vector<uint64_t>& nbit_hash,
    uint64_t bitindex) const;
  // Given a Dyninst function, create the set of instruction 3-grams.
  void BuildMnemonicNgrams(Dyninst::ParseAPI::Function* function,
    std::vector<MnemTuple>* tuples) const;
  // Debugging aid: dump the current float accumulator state (verbose mode).
  void DumpFloatState(std::vector<float>* output_floats);
  // Feature-ID -> weight map loaded at construction.
  std::map<uint64_t, float> weights_;
  bool verbose_ = false;
  // NOTE(review): guards mutable state during hashing — exact critical
  // sections are in the .cpp, not visible from this header.
  mutable std::mutex mutex_;
  // Fallback weights for features missing from weights_.
  double default_mnemonic_weight_;
  double default_graphlet_weight_;
  // Some primes between 2^63 and 2^64 from CityHash, used to seed the
  // hash family.
  static constexpr uint64_t seed0_ = 0xc3a5c85c97cb3127ULL;
  static constexpr uint64_t seed1_ = 0xb492b66fbe98f273ULL;
  static constexpr uint64_t seed2_ = 0x9ae16a3b2f90404fULL;
};
#endif // FUNCTIONSIMHASH_HPP