Skip to content

Commit

Permalink
preserve old behavior for kernels < 5.1 to stay under verifier's 4096…
Browse files Browse the repository at this point in the history
… instruction limit

Signed-off-by: Benjamin Kilimnik <[email protected]>
  • Loading branch information
benkilimnik committed Nov 30, 2023
1 parent 56cc19e commit adf1ec9
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@
#include "src/stirling/upid/upid.h"

// This keeps instruction count below BPF's limit of 4096 per probe.
#define LOOP_LIMIT 42
#define PROTOCOL_VEC_LIMIT 3
#define MAX_FILLER_SIZE (1 * 1024 * 1024) // 1MiB, taken from socket_trace.hpp

const int32_t kInvalidFD = -1;

Expand Down Expand Up @@ -476,13 +476,35 @@ static __inline void perf_submit_buf(struct pt_regs* ctx, const enum traffic_dir
} else if (buf_size_minus_1 < 0x7fffffff) {
// If-statement condition above is only required to prevent clang from optimizing
// away the `if (amount_copied > 0)` below.

// Here we truncate an iovec to MAX_MSG_SIZE (30KiB), then in user space we add a filler
// event if msg_size (size of this iovec) > msg_buf_size.
// If (msg_size - msg_buf_size) > our max filler size of 1MiB, then we push an event with a gap
// to the datastream buffer.
bpf_probe_read(&event->msg, MAX_MSG_SIZE, buf);
amount_copied = MAX_MSG_SIZE;
}

// If-statement is redundant, but is required to keep the 4.14 verifier happy.
if (amount_copied > 0) {
event->attr.msg_buf_size = amount_copied;
// bytes_missed should be 0 if we didn't truncate amount_copied to MAX_MSG_SIZE above.
// Note that perf_submit_buf won't correctly set bytes_missed for perf_submit_iovecs
// when bytes_remaining > iov_size and we've reached the loop limit
// because it takes only the size of the current iovec into account
// and not the bytes remaining across all iovecs which we drop due to the loop limit.
// In those cases we rely on the value set in perf_submit_iovecs.

// For older kernels < 5.1, we can't record gap metadata without exceeding the instruction
// limit.
if (LOOP_LIMIT > 42 || CHUNK_LIMIT > 4) {
if (event->attr.incomplete_chunk != kExceededLoopLimit) {
event->attr.bytes_missed = event->attr.msg_size - event->attr.msg_buf_size;
}
if (event->attr.bytes_missed > 0 && event->attr.incomplete_chunk == kFullyFormed) {
event->attr.incomplete_chunk = kUnknownGapReason;
}
}
socket_data_events.perf_submit(ctx, event, sizeof(event->attr) + amount_copied);
}
}
Expand All @@ -493,12 +515,23 @@ static __inline void perf_submit_wrapper(struct pt_regs* ctx,
struct socket_data_event_t* event) {
int bytes_sent = 0;
unsigned int i;

event->attr.incomplete_chunk = kFullyFormed;
event->attr.bytes_missed = 0;
#pragma unroll
for (i = 0; i < CHUNK_LIMIT; ++i) {
const int bytes_remaining = buf_size - bytes_sent;
const size_t current_size =
(bytes_remaining > MAX_MSG_SIZE && (i != CHUNK_LIMIT - 1)) ? MAX_MSG_SIZE : bytes_remaining;
// For older kernels < 5.1, we can't record gap metadata without exceeding the instruction
// limit.
if (LOOP_LIMIT > 42 || CHUNK_LIMIT > 4) {
// Check if we have reached the chunk limit, but there are bytes left to capture beyond our
// max msg size.
const bool chunks_not_fully_captured = i == CHUNK_LIMIT - 1 && current_size > MAX_MSG_SIZE;
if (chunks_not_fully_captured) {
event->attr.incomplete_chunk = kExceededChunkLimitAndMaxMsgSize;
}
}
perf_submit_buf(ctx, direction, buf + bytes_sent, current_size, conn_info, event);
bytes_sent += current_size;

Expand All @@ -516,15 +549,41 @@ static __inline void perf_submit_iovecs(struct pt_regs* ctx,
// array order. That means they read or fill iov[0], then iov[1], and so on. They return the total
size of the written or read data. Therefore, when looping through the buffers, both the number of
// buffers and the total size need to be checked. More details can be found on their man pages.
event->attr.incomplete_chunk = kFullyFormed;
event->attr.bytes_missed = 0;
int bytes_sent = 0;
unsigned int i;
#pragma unroll
for (int i = 0; i < LOOP_LIMIT && i < iovlen && bytes_sent < total_size; ++i) {
for (i = 0; i < LOOP_LIMIT && i < iovlen && bytes_sent < total_size; ++i) {
struct iovec iov_cpy;
BPF_PROBE_READ_VAR(iov_cpy, &iov[i]);

// total bytes we have left to copy across all iovecs
const int bytes_remaining = total_size - bytes_sent;
// bytes contained in this iovec (either bytes we have left or the size of the iovec, whichever
// is smaller) This can be >MAX_MSG_SIZE and thus truncated in perf_submit_buf
const size_t iov_size = min_size_t(iov_cpy.iov_len, bytes_remaining);

// For older kernels < 5.1, we can't record gap metadata without exceeding the instruction
// limit.
if (LOOP_LIMIT > 42 || CHUNK_LIMIT > 4) {
// We have reached the loop limit, but there are iovecs left to capture.
const bool iovec_not_fully_captured = i == LOOP_LIMIT - 1 && i < iovlen;
// This iov exceeds the MAX_MSG_SIZE, and will be truncated in perf_submit_buf.
const bool iov_size_exceeds_max_msg_size = iov_size > MAX_MSG_SIZE;

if (iovec_not_fully_captured && iov_size_exceeds_max_msg_size) {
event->attr.incomplete_chunk = kExceededLoopLimitAndMaxMsgSize;
} else if (iovec_not_fully_captured) {
event->attr.incomplete_chunk = kExceededLoopLimit;
// perf_submit_buf won't correctly set bytes_missed for perf_submit_iovecs
// if bytes_remaining > iov_size and we've reached the loop limit
// because it takes only the size of the current iovec into account
// see min(iov_len, bytes_remaining) above.
event->attr.bytes_missed = bytes_remaining - iov_size;
} else if (iov_size_exceeds_max_msg_size) {
event->attr.incomplete_chunk = kIovSizeExceededMaxMsgSize;
}
}
// TODO(oazizi/yzhao): Should switch this to go through perf_submit_wrapper.
// We don't have the BPF instruction count to do so right now.
perf_submit_buf(ctx, direction, iov_cpy.iov_base, iov_size, conn_info, event);
Expand All @@ -533,9 +592,6 @@ static __inline void perf_submit_iovecs(struct pt_regs* ctx,
// Move the position for the next event.
event->attr.pos += iov_size;
}

// TODO(oazizi): If there is data left after the loop limit, we should still report the remainder
// with a data-less event.
}

/***********************************************************
Expand Down Expand Up @@ -887,6 +943,22 @@ static __inline void process_syscall_sendfile(struct pt_regs* ctx, uint64_t id,
}

event->attr.pos = conn_info->wr_bytes;
// For older kernels < 5.1, we can't record gap metadata without exceeding the instruction
// limit.
if (LOOP_LIMIT > 42 || CHUNK_LIMIT > 4) {
// technically we drop all the data and just send the gap event, filling the gap with \0 bytes
// up to 1MB
if (bytes_count > MAX_FILLER_SIZE) {
// if we exceed the max filler size (1MB), we'll create a gap in the data stream buffer
event->attr.incomplete_chunk = kSendFileExceededMaxFillerSize;
} else {
// If we don't exceed max filler size for this sendfile, we record a complete
// kSendFile to keep track of in our metrics. If filler is enabled (lazy parsing off)
// we will allocate a filler event in user space to fill the gap left by an empty sendfile.
event->attr.incomplete_chunk = kSendFile;
}
event->attr.bytes_missed = bytes_count;
}
event->attr.msg_size = bytes_count;
event->attr.msg_buf_size = 0;
socket_data_events.perf_submit(ctx, event, sizeof(event->attr));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ struct close_event_t {
// This applies to messages that are over MAX_MSG_SIZE,
// and effectively makes the maximum message size to be CHUNK_LIMIT*MAX_MSG_SIZE.
#define CHUNK_LIMIT 4
#define LOOP_LIMIT 42

// Unique ID to all syscalls and a few other notable functions.
// This applies to events sent to user-space.
Expand Down Expand Up @@ -162,6 +163,29 @@ enum source_function_t {
kSSLRead,
};

// Keeps track of the reasons for missed data from bpf, resulting in
// a gap in the data stream buffer (which we sometimes fill with null bytes).
// Values are grouped by the function that sets them; kFullyFormed (0) means no gap.
enum chunk_t {
// No data was missed; the event carries its full payload.
kFullyFormed = 0,
// perf_submit_iovecs
// Hit LOOP_LIMIT before consuming all iovecs; the remaining iovecs are dropped.
kExceededLoopLimit = 1,
// A single iovec exceeded MAX_MSG_SIZE and was truncated in perf_submit_buf.
kIovSizeExceededMaxMsgSize = 2,
// Both of the above: last permitted iovec reached AND it exceeded MAX_MSG_SIZE.
kExceededLoopLimitAndMaxMsgSize = 3,
// perf_submit_wrapper
// Hit CHUNK_LIMIT while bytes beyond MAX_MSG_SIZE remained uncaptured.
kExceededChunkLimitAndMaxMsgSize = 4,
// process_syscall_sendfile
// sendfile data is not captured at all; user space may fill the gap
// (with null bytes) if filler events are enabled.
kSendFile = 5,
// sendfile byte count exceeded MAX_FILLER_SIZE, so a gap is left in the
// data stream buffer instead of a filler event.
kSendFileExceededMaxFillerSize = 6,
// filler event (populated in socket_trace.hpp) with size bytes_missed;
// the filler completely plugs the gap.
// TODO(@benkilimnik): eventually we should remove the filler event
// and use lazy parsing across the board.
kFiller = 7,
// gap we tried to fill was larger than max filler size (kMaxFilledSizeBytes, currently 1MB),
// so the filler only partially plugs it; bytes_missed holds the leftover gap size.
kIncompleteFiller = 8,
kHeaderEvent = 9, // synthetic length-header event (socket_trace.hpp); no gap
// bytes_missed > 0 but no specific cause was recorded (set in perf_submit_buf).
kUnknownGapReason = 10,
};

struct socket_data_event_t {
// We split attributes into a separate struct, because BPF gets upset if you do lots of
// size arithmetic. This makes it so that it's attributes followed by message.
Expand Down Expand Up @@ -195,15 +219,24 @@ struct socket_data_event_t {
// Note that write/send have separate sequences than read/recv.
uint64_t pos;

// The size of the original message. We use this to truncate msg field to minimize the amount
// of data being transferred.
// The size of the original message (or chunk of a message if iovlen > 1
// since each perf_submit passes one iovec as an event). We use
// this to truncate the msg field to minimize the amount of data being transferred.
uint32_t msg_size;

// The amount of data actually being sent to user space. This may be less than msg_size if
// data had to be truncated, or if the data was stripped because we only want to send metadata
// (e.g. if the connection data tracking has been disabled).
uint32_t msg_buf_size;

// Bytes we could not capture (gap size in the data stream buffer)
// Currently keeps track of cases where we exceed CHUNK_LIMIT or LOOP_LIMIT, or truncate in such
// a way that we create a gap. Should be 0 if incomplete_chunk enum is kFullyFormed.
uint32_t bytes_missed;

// Reason for incomplete chunk, if present.
enum chunk_t incomplete_chunk;

// Whether to prepend length header to the buffer for messages first inferred as Kafka. MySQL
// may also use this in this future.
// See infer_kafka_message in protocol_inference.h for details.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,22 @@
// The file name is kept identical to its BPF counterpart as well.

// Renders a socket_data_event_t's attributes as a single bracketed string,
// e.g. "[ts=... conn_id=... ... bytes_missed=0 incomplete_chunk=kFullyFormed]".
inline std::string ToString(const socket_data_event_t::attr_t& attr) {
  // absl::Substitute accepts at most 10 positional arguments ($0..$9), so the
  // attributes are formatted in two parts and concatenated.
  std::string base_str = absl::Substitute(
      "[ts=$0 conn_id=$1 protocol=$2 role=$3 dir=$4 ssl=$5 source_fn=$6 pos=$7 size=$8 "
      "buf_size=$9",
      attr.timestamp_ns, ToString(attr.conn_id), magic_enum::enum_name(attr.protocol),
      magic_enum::enum_name(attr.role), magic_enum::enum_name(attr.direction), attr.ssl,
      magic_enum::enum_name(attr.source_fn), attr.pos, attr.msg_size, attr.msg_buf_size);

  // The second part carries its own leading separator space; concatenating it
  // directly avoids the double space that StrCat(base_str, " ", second_part)
  // would otherwise produce between "buf_size" and "bytes_missed".
  std::string second_part =
      absl::Substitute(" bytes_missed=$0 incomplete_chunk=$1]", attr.bytes_missed,
                       magic_enum::enum_name(attr.incomplete_chunk));

  return absl::StrCat(base_str, second_part);
}

inline std::string ToString(const close_event_t& event) {
Expand Down Expand Up @@ -104,6 +114,8 @@ struct SocketDataEvent {
header_event_ptr->attr.pos = attr.pos - kHeaderBufSize;
header_event_ptr->attr.msg_buf_size = kHeaderBufSize;
header_event_ptr->attr.msg_size = kHeaderBufSize;
header_event_ptr->attr.incomplete_chunk = kHeaderEvent;
header_event_ptr->attr.bytes_missed = 0;

// Take the length_header from the original, fix byte ordering, and place
// into length_header of the header_event.
Expand All @@ -124,7 +136,7 @@ struct SocketDataEvent {
// For events that which couldn't transfer all its data, we have two options:
// 1) A missing event.
// 2) A filler event.
// A desired filler event is indicated by a msg_size > msg_buf_size when creating the BPF event.
// A desired filler event is indicated by a bytes_missed > 0 when creating the BPF event.
//
// A filler event is used in particular for sendfile data.
// We need a better long-term solution for this,
Expand All @@ -134,28 +146,51 @@ struct SocketDataEvent {

DCHECK_GE(attr.msg_size, attr.msg_buf_size);

if (attr.msg_size > attr.msg_buf_size) {
VLOG(1) << "Adding filler to event";
// Note that msg_size - msg_buf_size != bytes_missed in the case where we exceed LOOP_LIMIT
// in perf_submit_iovecs, because one call to perf_submit_buf takes only the size of the current
// iovec into account, omitting the rest of the iovecs which could not be submitted.
// As a result, we need to use bytes_missed to determine the size of the filler event.

// For kernels < 5.1, we cannot track the bytes missed in socket_trace.c properly and thus
// preserve the previous behavior of encoding the bytes missed via the msg_size.
// If our loop and chunk limits are at most 42 and 4, then we know that we can
// stay below the verifier instruction limit for kernels < 5.1.
if (LOOP_LIMIT <= 42 && CHUNK_LIMIT <= 4) {
if (attr.msg_size > attr.msg_buf_size) {
DCHECK_EQ(attr.bytes_missed, 0);
attr.bytes_missed = attr.msg_size - attr.msg_buf_size;
}
}
if (attr.bytes_missed > 0) {
VLOG(1) << absl::Substitute("Adding filler event for incomplete_chunk: $0, bytes_missed: $1", magic_enum::enum_name(attr.incomplete_chunk), attr.bytes_missed);

// Limit the size so we don't have huge allocations.
constexpr uint32_t kMaxFilledSizeBytes = 1 * 1024 * 1024;
static char kZeros[kMaxFilledSizeBytes] = {0};

size_t filler_size = attr.msg_size - attr.msg_buf_size;
filler_event_ptr = std::make_unique<SocketDataEvent>();
filler_event_ptr->attr = attr;
size_t filler_size = attr.bytes_missed;
if (filler_size > kMaxFilledSizeBytes) {
VLOG(1) << absl::Substitute("Truncating filler event: $0->$1", filler_size,
kMaxFilledSizeBytes);
filler_size = kMaxFilledSizeBytes;
// incomplete even after filler (bytes_missed > 1MB)
filler_event_ptr->attr.incomplete_chunk = kIncompleteFiller;
filler_event_ptr->attr.bytes_missed -= kMaxFilledSizeBytes;
} else {
// We encode the filler size in bytes_missed for filler events which completely plug a gap (chunk_t kFiller) in our metrics.
// (In reality, bytes missed is 0 since filler plugs the gap.)
// In all other circumstances bytes_missed represents the size of the gap
filler_event_ptr->attr.incomplete_chunk = kFiller;
}

filler_event_ptr = std::make_unique<SocketDataEvent>();
filler_event_ptr->attr = attr;
filler_event_ptr->attr.pos = attr.pos + attr.msg_buf_size;
filler_event_ptr->attr.msg_buf_size = filler_size;
filler_event_ptr->attr.msg_size = filler_size;
filler_event_ptr->msg = std::string_view(kZeros, filler_size);

// We've created the filler event, so adjust the original event accordingly.
DCHECK(filler_size <= attr.bytes_missed);
attr.msg_size = attr.msg_buf_size;
}

Expand Down

0 comments on commit adf1ec9

Please sign in to comment.