Skip to content

Commit

Permalink
Raise loop and chunk limit for newer kernels (#1795)
Browse files Browse the repository at this point in the history
Summary: Dynamically increase the loop limit for newer kernels with
higher instruction limits (1 million for kernels > 5.1) by 21x to reduce
data loss and raise ingest. More details in #1755.

One open question is whether we want to add vizier flag to toggle this
behavior in case there are unforseen performance bottlenecks for certain
clusters.

Type of change: /kind feature

Test Plan: Existing targets + perf/demo tests outlined in #1755.

---------

Signed-off-by: Benjamin Kilimnik <[email protected]>
  • Loading branch information
benkilimnik authored Jan 23, 2024
1 parent 160b065 commit 8cacddc
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#include "src/stirling/upid/upid.h"

// This keeps instruction count below BPF's limit of 4096 per probe.
#define LOOP_LIMIT 42
#define LOOP_LIMIT BPF_LOOP_LIMIT
#define PROTOCOL_VEC_LIMIT 3

const int32_t kInvalidFD = -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ struct close_event_t {
// This defines how many chunks a perf_submit can support.
// This applies to messages that are over MAX_MSG_SIZE,
// and effectively makes the maximum message size to be CHUNK_LIMIT*MAX_MSG_SIZE.
#define CHUNK_LIMIT 4
#define CHUNK_LIMIT BPF_CHUNK_LIMIT

// Unique ID to all syscalls and a few other notable functions.
// This applies to events sent to user-space.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

#include "src/common/fs/temp_file.h"
#include "src/common/system/clock.h"
#include "src/common/system/kernel_version.h"
#include "src/common/system/tcp_socket.h"
#include "src/common/system/udp_socket.h"
#include "src/common/system/unix_socket.h"
Expand Down Expand Up @@ -490,9 +491,9 @@ TEST_F(SocketTraceBPFTest, LargeMessages) {
std::string large_response =
"HTTP/1.1 200 OK\r\n"
"Content-Type: application/json; msg2\r\n"
"Content-Length: 131072\r\n"
"Content-Length: 3512768\r\n"
"\r\n";
large_response += std::string(131072, '+');
large_response += std::string(3512768, '+');

testing::SendRecvScript script({
{{kHTTPReqMsg1}, {large_response}},
Expand All @@ -507,19 +508,29 @@ TEST_F(SocketTraceBPFTest, LargeMessages) {
GetMutableConnTracker(system.ClientPID(), system.ClientFD()));
EXPECT_EQ(client_tracker->send_data().data_buffer().Head(), kHTTPReqMsg1);
std::string client_recv_data(client_tracker->recv_data().data_buffer().Head());
EXPECT_THAT(client_recv_data.size(), 131153);
EXPECT_THAT(client_recv_data.size(), 3512850);
EXPECT_THAT(client_recv_data, HasSubstr("+++++"));
EXPECT_EQ(client_recv_data.substr(client_recv_data.size() - 5, 5), "+++++");

// The server's send syscall transmits all 131153 bytes in one shot.
// The server's send syscall transmits all 3512850 bytes in one shot.
// This is over the limit that we can transmit through BPF, and so we expect
// filler bytes on this side of the connection. Note that the client doesn't have the
// filler bytes on this side of the connection (up to 1MB). Note that the client doesn't have the
// same behavior, because the recv syscall provides the data in chunks.
ASSERT_OK_AND_ASSIGN(auto* server_tracker,
GetMutableConnTracker(system.ServerPID(), system.ServerFD()));
EXPECT_EQ(server_tracker->recv_data().data_buffer().Head(), kHTTPReqMsg1);
std::string server_send_data(server_tracker->send_data().data_buffer().Head());
EXPECT_THAT(server_send_data.size(), 131153);
auto kernel = system::GetCachedKernelVersion();
bool kernelGreaterThan5_1 = kernel.version >= 5 || (kernel.version == 5 && kernel.major_rev >= 1);
if (kernelGreaterThan5_1) {
// CHUNK_LIMIT * MAX_MSG_SIZE bytes + up to 1MB filler.
// Currently 84*30720 + 932,370 filler bytes = 3,512,850
EXPECT_THAT(server_send_data.size(), 3512850);
} else {
// CHUNK_LIMIT * MAX_MSG_SIZE bytes + up to 1MB filler.
// Currently 4*30720 + 1024*1024 = 1,171,456, leaving gap of 2,341,394 bytes
EXPECT_THAT(server_send_data.size(), 1171456);
}
EXPECT_THAT(server_send_data, HasSubstr("+++++"));
// We expect filling with \0 bytes.
EXPECT_EQ(server_send_data.substr(server_send_data.size() - 5, 5), ConstStringView("\0\0\0\0\0"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <google/protobuf/text_format.h>
#include <google/protobuf/util/delimited_message_util.h>
#include <magic_enum.hpp>
#include "src/common/system/kernel_version.h"

#include "src/common/base/base.h"
#include "src/common/base/utils.h"
Expand Down Expand Up @@ -175,6 +176,15 @@ DEFINE_bool(
stirling_debug_tls_sources, gflags::BoolFromEnv("PX_DEBUG_TLS_SOURCES", false),
"If true, stirling will add additional prometheus metrics regarding the traced tls sources");

DEFINE_uint32(stirling_bpf_loop_limit, 42,
"The maximum number of iovecs to capture for syscalls. "
"Set conservatively for older kernels by default to keep the instruction count below "
"BPF's limit for version 4 kernels (4096 per probe).");

DEFINE_uint32(stirling_bpf_chunk_limit, 4,
"The maximum number of chunks a perf_submit can support. "
"This applies to messages that are over MAX_MSG_SIZE.");

OBJ_STRVIEW(socket_trace_bcc_script, socket_trace);

namespace px {
Expand Down Expand Up @@ -431,6 +441,18 @@ auto SocketTraceConnector::InitPerfBufferSpecs() {
}

Status SocketTraceConnector::InitBPF() {
// set BPF loop limit and chunk limit based on kernel version
auto kernel = system::GetCachedKernelVersion();
if (kernel.version >= 5 || (kernel.version == 5 && kernel.major_rev >= 1)) {
// Kernels >= 5.1 have higher BPF instruction limits (1 million for verifier).
// This enables a 21x increase to our loop and chunk limits
FLAGS_stirling_bpf_loop_limit = 882;
FLAGS_stirling_bpf_chunk_limit = 84;
LOG(INFO) << absl::Substitute(
"Kernel version greater than V5.1 detected ($0), raised loop limit to $1 and chunk limit "
"to $2",
kernel.ToString(), FLAGS_stirling_bpf_loop_limit, FLAGS_stirling_bpf_chunk_limit);
}
// PROTOCOL_LIST: Requires update on new protocols.
std::vector<std::string> defines = {
absl::StrCat("-DENABLE_TLS_DEBUG_SOURCES=", FLAGS_stirling_debug_tls_sources),
Expand All @@ -445,6 +467,8 @@ Status SocketTraceConnector::InitBPF() {
absl::StrCat("-DENABLE_NATS_TRACING=", protocol_transfer_specs_[kProtocolNATS].enabled),
absl::StrCat("-DENABLE_AMQP_TRACING=", protocol_transfer_specs_[kProtocolAMQP].enabled),
absl::StrCat("-DENABLE_MONGO_TRACING=", protocol_transfer_specs_[kProtocolMongo].enabled),
absl::StrCat("-DBPF_LOOP_LIMIT=", FLAGS_stirling_bpf_loop_limit),
absl::StrCat("-DBPF_CHUNK_LIMIT=", FLAGS_stirling_bpf_chunk_limit),
};
PX_RETURN_IF_ERROR(bcc_->InitBPFProgram(socket_trace_bcc_script, defines));

Expand Down

0 comments on commit 8cacddc

Please sign in to comment.