Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[llvm] Add IR2Vec as an observation space #560

Open
wants to merge 20 commits into
base: development
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 47 additions & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,6 @@ cc_library(
name = "fmt",
srcs = glob(["src/*.cc"]),
hdrs = glob(["include/fmt/*.h"]),
copts = ["-Iexternal/fmt/include"],
strip_include_prefix = "include",
visibility = ["//visibility:public"],
)
Expand Down Expand Up @@ -342,3 +341,50 @@ http_archive(
load("@programl//tools:bzl/deps.bzl", "programl_deps")

programl_deps()

# === IR2Vec ===
# https://github.com/IITH-Compilers/IR2Vec

# Fetches the IR2Vec sources at a pinned commit and builds them with the inline
# BUILD file below.
# NOTE(review): the archive URL points at the ChrisCummins/IR2Vec fork rather
# than the IITH-Compilers repository — confirm this is intentional.
http_archive(
name = "ir2vec",
# Inline BUILD file. The genrule writes a version.h that defines
# IR2VEC_VERSION as "1"; the cc_library compiles src/*.cpp together with that
# generated header, exposes the headers under src/include with that prefix
# stripped, and depends on Eigen and LLVM 10.0.0.
build_file_content = """
genrule(
name = "version",
outs = ["version.h"],
cmd = "echo '#define IR2VEC_VERSION \\"1\\"' > $@",
)

cc_library(
name = "ir2vec",
srcs = glob(["src/*.cpp"]) + [":version.h"],
hdrs = glob(["src/include/*.h"]),
copts = ["-Iexternal/ir2vec/src/include"],
strip_include_prefix = "src/include",
visibility = ["//visibility:public"],
deps = [
"@eigen//:eigen",
"@llvm//10.0.0",
],
)
""",
# strip_prefix and urls name the same commit hash; keep them in sync with
# sha256 when bumping the pinned revision.
sha256 = "f6c5af059840889e584c13331fabc6a469c40cdf0e44b3284e7db4fe9093289c",
strip_prefix = "IR2Vec-828e50584b9c8bc305208e22d2cca272bdb1ab64",
urls = ["https://github.com/ChrisCummins/IR2Vec/archive/828e50584b9c8bc305208e22d2cca272bdb1ab64.tar.gz"],
)

# === Eigen ===
# https://eigen.tuxfamily.org/index.php?title=Main_Page

# Fetches the Eigen 3.3.7 release archive. The "@eigen//:eigen" target defined
# here is listed in the deps of the IR2Vec cc_library in this WORKSPACE.
http_archive(
name = "eigen",
# Inline BUILD file: the library declares no srcs and exposes every file
# under Eigen/ as a header.
build_file_content = """
cc_library(
name = "eigen",
hdrs = glob(["Eigen/**/*"]),
visibility = ["//visibility:public"],
)
""",
sha256 = "d56fbad95abf993f8af608484729e3d87ef611dd85b3380a8bad1d5cbc373a57",
strip_prefix = "eigen-3.3.7",
urls = ["https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.tar.gz"],
)
3 changes: 3 additions & 0 deletions compiler_gym/envs/llvm/service/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ filegroup(
name = "service",
srcs = [
":compiler_gym-llvm-service",
# Runtime data dependencies:
"//compiler_gym/third_party/ir2vec:embeddings",
] + select({
"@llvm//:darwin": [],
"//conditions:default": [
Expand Down Expand Up @@ -260,6 +262,7 @@ cc_library(
"//compiler_gym/util:GrpcStatusMacros",
"@boost//:filesystem",
"@glog",
"@ir2vec",
"@llvm//10.0.0",
"@magic_enum",
"@nlohmann_json//:json",
Expand Down
5 changes: 4 additions & 1 deletion compiler_gym/envs/llvm/service/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ cg_add_all_subdirs()
set(_DEPS "compiler_gym-llvm-service")
cg_filegroup(
NAME "service"
DEPENDS ${_DEPS}
DEPENDS
${_DEPS}
compiler_gym::third_party::ir2vec::embeddings
)

cg_cc_binary(
Expand Down Expand Up @@ -236,6 +238,7 @@ cg_cc_library(
CpuInfo::cpuinfo
Boost::filesystem
glog::glog
ir2vec::ir2vec
${_LLVM_LIBS}
magic_enum
nlohmann_json::nlohmann_json
Expand Down
69 changes: 67 additions & 2 deletions compiler_gym/envs/llvm/service/Observation.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
#include "compiler_gym/third_party/autophase/InstCount.h"
#include "compiler_gym/third_party/llvm/InstCount.h"
#include "compiler_gym/util/GrpcStatusMacros.h"
#include "compiler_gym/util/RunfilesPath.h"
#include "llvm/Bitcode/BitcodeWriter.h"
// #include "llvm/IR/Metadata.h"
#include "IR2Vec.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
#include "nlohmann/json.hpp"
Expand Down Expand Up @@ -77,17 +79,80 @@ Status setObservation(LlvmObservationSpace space, const fs::path& workingDirecto
break;
}
case LlvmObservationSpace::INST_COUNT: {
const auto features = InstCount::getFeatureVector(benchmark.module());
InstCountFeatureVector features = InstCount::getFeatureVector(benchmark.module());
*reply.mutable_int64_tensor()->mutable_shape()->Add() = features.size();
*reply.mutable_int64_tensor()->mutable_value() = {features.begin(), features.end()};
break;
}
case LlvmObservationSpace::AUTOPHASE: {
const auto features = autophase::InstCount::getFeatureVector(benchmark.module());
const std::vector<int64_t> features =
autophase::InstCount::getFeatureVector(benchmark.module());
*reply.mutable_int64_tensor()->mutable_shape()->Add() = features.size();
*reply.mutable_int64_tensor()->mutable_value() = {features.begin(), features.end()};
break;
}
case LlvmObservationSpace::IR2VEC_FLOW_AWARE: {
  // Program-level IR2Vec embedding in flow-aware mode, returned as a
  // one-dimensional float tensor.
  // The seed embedding vocabulary is located with util::getRunfilesPath().
  const auto vocabularyPath = util::getRunfilesPath(
      "compiler_gym/third_party/ir2vec/seedEmbeddingVocab-300-llvm10.txt");
  IR2Vec::Embeddings embedder(benchmark.module(), IR2Vec::IR2VecMode::FlowAware,
                              vocabularyPath.string());
  const IR2Vec::Vector& programVector = embedder.getProgramVector();

  // The shape is the vector length; the values are copied element by element.
  auto* tensor = reply.mutable_float_tensor();
  tensor->mutable_shape()->Add(programVector.size());
  *tensor->mutable_value() = {programVector.begin(), programVector.end()};
  break;
}
case LlvmObservationSpace::IR2VEC_SYMBOLIC: {
  // Program-level IR2Vec embedding in symbolic mode, returned as a
  // one-dimensional float tensor.
  // The seed embedding vocabulary is located with util::getRunfilesPath().
  const auto vocabularyPath = util::getRunfilesPath(
      "compiler_gym/third_party/ir2vec/seedEmbeddingVocab-300-llvm10.txt");
  IR2Vec::Embeddings embedder(benchmark.module(), IR2Vec::IR2VecMode::Symbolic,
                              vocabularyPath.string());
  const llvm::SmallVector<double, 300>& programVector = embedder.getProgramVector();

  // The shape is the vector length; the values are copied element by element.
  auto* tensor = reply.mutable_float_tensor();
  tensor->mutable_shape()->Add(programVector.size());
  *tensor->mutable_value() = {programVector.begin(), programVector.end()};
  break;
}
case LlvmObservationSpace::IR2VEC_FUNCTION_LEVEL_FLOW_AWARE: {
  // Function-level IR2Vec embeddings in flow-aware mode. The observation is an
  // Opaque "json://" payload holding a JSON object that maps each function
  // name to its embedding vector.
  const auto ir2vecEmbeddingsPath = util::getRunfilesPath(
      "compiler_gym/third_party/ir2vec/seedEmbeddingVocab-300-llvm10.txt");
  IR2Vec::Embeddings embeddings(benchmark.module(), IR2Vec::IR2VecMode::FlowAware,
                                ir2vecEmbeddingsPath.string());
  const llvm::SmallMapVector<const llvm::Function*, llvm::SmallVector<double, 300>, 16>&
      functionMap = embeddings.getFunctionVecMap();

  // Start from an explicit JSON object so that an empty map serializes as
  // "{}" rather than "null".
  json data = json::object();
  // Iterate by const reference: iterating by value copies every map entry,
  // including its embedding vector, before it is copied again into the JSON.
  for (const auto& [function, embedding] : functionMap) {
    // StringRef::str() makes the conversion to the std::string key explicit.
    data[function->getName().str()] = std::vector<double>(embedding.begin(), embedding.end());
  }

  Opaque opaque;
  opaque.set_format("json://");
  *opaque.mutable_data() = data.dump();
  reply.mutable_any_value()->PackFrom(opaque);
  break;
}
case LlvmObservationSpace::IR2VEC_FUNCTION_LEVEL_SYMBOLIC: {
  // Function-level IR2Vec embeddings in symbolic mode. The observation is an
  // Opaque "json://" payload holding a JSON object that maps each function
  // name to its embedding vector.
  const auto ir2vecEmbeddingsPath = util::getRunfilesPath(
      "compiler_gym/third_party/ir2vec/seedEmbeddingVocab-300-llvm10.txt");
  IR2Vec::Embeddings embeddings(benchmark.module(), IR2Vec::IR2VecMode::Symbolic,
                                ir2vecEmbeddingsPath.string());
  const llvm::SmallMapVector<const llvm::Function*, llvm::SmallVector<double, 300>, 16>&
      functionMap = embeddings.getFunctionVecMap();

  // Start from an explicit JSON object so that an empty map serializes as
  // "{}" rather than "null".
  json data = json::object();
  // Iterate by const reference: iterating by value copies every map entry,
  // including its embedding vector, before it is copied again into the JSON.
  for (const auto& [function, embedding] : functionMap) {
    // StringRef::str() makes the conversion to the std::string key explicit.
    data[function->getName().str()] = std::vector<double>(embedding.begin(), embedding.end());
  }

  Opaque opaque;
  opaque.set_format("json://");
  *opaque.mutable_data() = data.dump();
  reply.mutable_any_value()->PackFrom(opaque);
  break;
}
case LlvmObservationSpace::PROGRAML:
case LlvmObservationSpace::PROGRAML_JSON: {
// Build the ProGraML graph.
Expand Down
86 changes: 86 additions & 0 deletions compiler_gym/envs/llvm/service/ObservationSpaces.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ namespace compiler_gym::llvm_service {

// The number of features in the Autophase feature vector.
static constexpr size_t kAutophaseFeatureDim = 56;

// The number of features in the IR2Vec feature vector.
static constexpr size_t kIR2VecFeatureDim = 300;

// 4096 is the maximum path length for most filesystems.
static constexpr size_t kMaximumPathLength = 4096;

Expand Down Expand Up @@ -116,6 +120,88 @@ std::vector<ObservationSpace> getLlvmObservationSpaceList() {
->Add(low.begin(), low.end());
break;
}
case LlvmObservationSpace::IR2VEC_FLOW_AWARE: {
  // Describes the program-level flow-aware IR2Vec space: a float box of
  // kIR2VecFeatureDim elements spanning the full finite float range, with an
  // all-zero default observation.

  // Gives `tensor` a single dimension of kIR2VecFeatureDim and fills it with `value`.
  const auto fillTensor = [](FloatTensor* tensor, float value) {
    tensor->add_shape(kIR2VecFeatureDim);
    const std::vector<float> values(kIR2VecFeatureDim, value);
    *tensor->mutable_value() = {values.begin(), values.end()};
  };

  FloatBox& box = *space.mutable_float_box();
  fillTensor(box.mutable_low(), std::numeric_limits<float>::lowest());
  fillTensor(box.mutable_high(), std::numeric_limits<float>::max());

  observationSpace.set_deterministic(true);
  observationSpace.set_platform_dependent(false);

  fillTensor(observationSpace.mutable_default_observation()->mutable_float_tensor(), 0.0f);
  break;
}
case LlvmObservationSpace::IR2VEC_SYMBOLIC: {
  // Describes the program-level symbolic IR2Vec space: a float box of
  // kIR2VecFeatureDim elements spanning the full finite float range, with an
  // all-zero default observation.

  // Gives `tensor` a single dimension of kIR2VecFeatureDim and fills it with `value`.
  const auto fillTensor = [](FloatTensor* tensor, float value) {
    tensor->add_shape(kIR2VecFeatureDim);
    const std::vector<float> values(kIR2VecFeatureDim, value);
    *tensor->mutable_value() = {values.begin(), values.end()};
  };

  FloatBox& box = *space.mutable_float_box();
  fillTensor(box.mutable_low(), std::numeric_limits<float>::lowest());
  fillTensor(box.mutable_high(), std::numeric_limits<float>::max());

  observationSpace.set_deterministic(true);
  observationSpace.set_platform_dependent(false);

  fillTensor(observationSpace.mutable_default_observation()->mutable_float_tensor(), 0.0f);
  break;
}
case LlvmObservationSpace::IR2VEC_FUNCTION_LEVEL_FLOW_AWARE: {
  // Function-level flow-aware IR2Vec embeddings are delivered as a JSON
  // string, so the space is a string with no minimum length.
  space.mutable_string_value()->mutable_length_range()->set_min(0);

  observationSpace.set_deterministic(true);
  observationSpace.set_platform_dependent(false);

  // Default observation: {"embeddings": {"default": [0.0, ...]}} with
  // kIR2VecFeatureDim zeros.
  // NOTE(review): setObservation() emits a flat {function name: vector} object
  // for this space, without the "embeddings" wrapper — confirm the two shapes
  // are meant to differ.
  json defaultJson;
  defaultJson["embeddings"]["default"] = std::vector<double>(kIR2VecFeatureDim, 0.0);

  Opaque payload;
  payload.set_format("json://");
  payload.set_data(defaultJson.dump());
  observationSpace.mutable_default_observation()->mutable_any_value()->PackFrom(payload);
  break;
}
case LlvmObservationSpace::IR2VEC_FUNCTION_LEVEL_SYMBOLIC: {
  // Function-level symbolic IR2Vec embeddings are delivered as a JSON string,
  // so the space is a string with no minimum length.
  space.mutable_string_value()->mutable_length_range()->set_min(0);

  observationSpace.set_deterministic(true);
  observationSpace.set_platform_dependent(false);

  // Default observation: {"embeddings": {"default": [0.0, ...]}} with
  // kIR2VecFeatureDim zeros.
  // NOTE(review): setObservation() emits a flat {function name: vector} object
  // for this space, without the "embeddings" wrapper — confirm the two shapes
  // are meant to differ.
  json defaultJson;
  defaultJson["embeddings"]["default"] = std::vector<double>(kIR2VecFeatureDim, 0.0);

  Opaque payload;
  payload.set_format("json://");
  payload.set_data(defaultJson.dump());
  observationSpace.mutable_default_observation()->mutable_any_value()->PackFrom(payload);
  break;
}
case LlvmObservationSpace::PROGRAML: {
// ProGraML serializes the graph to JSON.
space.mutable_string_value()->mutable_length_range()->set_min(0);
Expand Down
58 changes: 55 additions & 3 deletions compiler_gym/envs/llvm/service/ObservationSpaces.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ namespace compiler_gym::llvm_service {
* 1. Add a new entry to this LlvmObservationSpace enum.
* 2. Add a new switch case to getLlvmObservationSpaceList() to return the
* ObservationSpace.
* 3. Add a new switch case to LlvmSession::getObservation() to compute
* the actual observation.
* 4. Run `bazel test //compiler_gym/...` and update the newly failing tests.
* 3. Add a new switch case to setObservation() to compute the actual
* observation.
* 4. Run `make test` and update the newly failing tests.
*/
enum class LlvmObservationSpace {
/**
Expand All @@ -46,6 +46,58 @@ enum class LlvmObservationSpace {
* deep reinforcement learning. FCCM.
*/
AUTOPHASE,
/**
* The IR2Vec Program Level Flow-Aware embeddings.
*
* From:
*
* S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar,
* Ramakrishna Upadrasta, and Y. N. Srikant. (2020).
* IR2VEC: LLVM IR Based Scalable Program Embeddings.
* ACM Trans. Archit. Code Optim. 17, 4, Article 32 (December 2020), 27 pages.
* DOI:https://doi.org/10.1145/3418463
*
*/
IR2VEC_FLOW_AWARE,
/**
* The IR2Vec Program Level Symbolic embeddings.
*
* From:
*
* S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar,
* Ramakrishna Upadrasta, and Y. N. Srikant. (2020).
* IR2VEC: LLVM IR Based Scalable Program Embeddings.
* ACM Trans. Archit. Code Optim. 17, 4, Article 32 (December 2020), 27 pages.
* DOI:https://doi.org/10.1145/3418463
*
*/
IR2VEC_SYMBOLIC,
/**
* The IR2Vec Function level Flow Aware embeddings.
*
* From:
*
* S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar,
* Ramakrishna Upadrasta, and Y. N. Srikant. (2020).
* IR2VEC: LLVM IR Based Scalable Program Embeddings.
* ACM Trans. Archit. Code Optim. 17, 4, Article 32 (December 2020), 27 pages.
* DOI:https://doi.org/10.1145/3418463
*
*/
IR2VEC_FUNCTION_LEVEL_FLOW_AWARE,
/**
* The IR2Vec Function level Symbolic embeddings.
*
* From:
*
* S. VenkataKeerthy, Rohit Aggarwal, Shalini Jain, Maunendra Sankar Desarkar,
* Ramakrishna Upadrasta, and Y. N. Srikant. (2020).
* IR2VEC: LLVM IR Based Scalable Program Embeddings.
* ACM Trans. Archit. Code Optim. 17, 4, Article 32 (December 2020), 27 pages.
* DOI:https://doi.org/10.1145/3418463
*
*/
IR2VEC_FUNCTION_LEVEL_SYMBOLIC,
/**
* Returns the graph representation of a program as a networkx Graph.
*
Expand Down
12 changes: 12 additions & 0 deletions compiler_gym/third_party/ir2vec/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# IR2Vec. https://github.com/IITH-Compilers/IR2Vec

# The IR2Vec seed embedding vocabulary file. The LLVM service BUILD file lists
# this target as a runtime data dependency, and Observation.cc looks the file
# up by this path through util::getRunfilesPath().
filegroup(
name = "embeddings",
srcs = ["seedEmbeddingVocab-300-llvm10.txt"],
visibility = ["//visibility:public"],
)
12 changes: 12 additions & 0 deletions compiler_gym/third_party/ir2vec/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

cg_add_all_subdirs()

# The IR2Vec seed embedding vocabulary file. The LLVM service CMakeLists.txt
# depends on this target as compiler_gym::third_party::ir2vec::embeddings.
cg_filegroup(
NAME "embeddings"
FILES
"${CMAKE_CURRENT_LIST_DIR}/seedEmbeddingVocab-300-llvm10.txt"
)
Loading