From 56682c429b5c9a00d289ecc55a242733efca204c Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Tue, 16 Apr 2024 19:38:34 +0100 Subject: [PATCH 01/13] Initial foray into route caching. This is not good code. It also appears to give us an extra 200Mbps. --- xde/src/xde.rs | 102 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 2 deletions(-) diff --git a/xde/src/xde.rs b/xde/src/xde.rs index e588b4cd..071cddf4 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -53,6 +53,7 @@ use opte::ddi::sync::KMutexType; use opte::ddi::sync::KRwLock; use opte::ddi::sync::KRwLockType; use opte::ddi::time::Interval; +use opte::ddi::time::Moment; use opte::ddi::time::Periodic; use opte::engine::ether::EtherAddr; use opte::engine::geneve::Vni; @@ -1420,6 +1421,33 @@ fn guest_loopback( ptr::null_mut() } +// XXX: clean this up, after measuring. +// XXX: completely arbitrary timeout. +const MAX_ROUTE_LIFETIME: Duration = Duration::from_millis(100); +static ROUTE_CACHE: KRwLock< + alloc::collections::BTreeMap, +> = KRwLock::new(alloc::collections::BTreeMap::new()); +#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] +struct RouteKey { + dst: Ipv6Addr, + l4_hash: Option, +} + +#[derive(Copy, Clone, Debug)] +struct CachedRoute { + src: EtherAddr, + dst: EtherAddr, + underlay_idx: u8, + timestep: Moment, +} + +impl CachedRoute { + pub fn is_still_valid(&self, t: Moment) -> bool { + t.delta_as_millis(self.timestep) + <= MAX_ROUTE_LIFETIME.as_millis() as u64 + } +} + #[no_mangle] unsafe extern "C" fn xde_mc_tx( arg: *mut c_void, @@ -1537,8 +1565,78 @@ unsafe extern "C" fn xde_mc_tx( // for the mac associated with the IRE nexthop to fill in // the outer frame of the packet. Also return the underlay // device associated with the nexthop - let (src, dst, underlay_dev) = - next_hop(&ip6.dst, src_dev, meta.l4_hash()); + // + // As route lookups are fairly expensive, we can cache their + // results for a given dst + entropy. These have a fairly tight + // expiry so that we can actually react to new reachability/load + // info from DDM. + // XXX: this is an absolute mess. + let my_key = RouteKey { dst: ip6.dst, l4_hash: meta.l4_hash() }; + let mut maybe_route = { + let route_cache = ROUTE_CACHE.read(); + route_cache.get(&my_key).copied() + }; + let t = Moment::now(); + + let recompute = match maybe_route { + None => true, + Some(route) => !route.is_still_valid(t), + }; + let (src, dst, underlay_dev) = if recompute { + let mut route_cache = ROUTE_CACHE.write(); + // Someone else may have written while we were + // taking the lock. DO NOT waste time if it's a + // good route. + maybe_route = route_cache.get(&my_key).copied(); + let recompute = match maybe_route { + None => true, + Some(route) => !route.is_still_valid(t), + }; + if recompute { + let (src, dst, underlay_dev) = + next_hop(&ip6.dst, src_dev, meta.l4_hash()); + + // This is terrible. + let underlay_idx = if underlay_dev as *const _ + == (&*src_dev.u1) as *const _ + { + 0 + } else { + 1 + }; + + route_cache.insert( + my_key, + CachedRoute { src, dst, underlay_idx, timestep: t }, + ); + + (src, dst, underlay_dev) + } else { + let x = maybe_route.unwrap(); + + ( + x.src, + x.dst, + if x.underlay_idx == 0 { + &*src_dev.u1 + } else { + &*src_dev.u2 + }, + ) + } + } else { + let x = maybe_route.unwrap(); + + ( + x.src, + x.dst, + if x.underlay_idx == 0 { + &*src_dev.u1 + } else { + &*src_dev.u2 + }, + ) + }; // Get a pointer to the beginning of the outer frame and // fill in the dst/src addresses before sending out the From 0e91dd13547fa10eb5c2f2fbae7dfc0ec172d9ee Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Tue, 16 Apr 2024 19:48:58 +0100 Subject: [PATCH 02/13] Reuse existing l4 hash... --- xde/src/xde.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 071cddf4..d19cafea 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -1438,12 +1438,12 @@ struct CachedRoute { src: EtherAddr, dst: EtherAddr, underlay_idx: u8, - timestep: Moment, + timestamp: Moment, } impl CachedRoute { pub fn is_still_valid(&self, t: Moment) -> bool { - t.delta_as_millis(self.timestep) + t.delta_as_millis(self.timestamp) <= MAX_ROUTE_LIFETIME.as_millis() as u64 } } @@ -1594,7 +1594,7 @@ unsafe extern "C" fn xde_mc_tx( }; if recompute { let (src, dst, underlay_dev) = - next_hop(&ip6.dst, src_dev, meta.l4_hash()); + next_hop(&ip6.dst, src_dev, my_key.l4_hash); // This is terrible. let underlay_idx = if underlay_dev as *const _ @@ -1607,7 +1607,7 @@ unsafe extern "C" fn xde_mc_tx( route_cache.insert( my_key, - CachedRoute { src, dst, underlay_idx, timestep: t }, + CachedRoute { src, dst, underlay_idx, timestamp: t }, ); (src, dst, underlay_dev) From f313250f8fc5c220eb16b664e837e8912152da47 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 19 Apr 2024 11:38:04 +0100 Subject: [PATCH 03/13] Cleanup implementation, move cache to per-port. --- xde/src/lib.rs | 1 + xde/src/route.rs | 500 +++++++++++++++++++++++++++++++++++++++++++++++ xde/src/xde.rs | 483 ++------------------------------------------- 3 files changed, 521 insertions(+), 463 deletions(-) create mode 100644 xde/src/route.rs diff --git a/xde/src/lib.rs b/xde/src/lib.rs index e4b4f073..e6026582 100644 --- a/xde/src/lib.rs +++ b/xde/src/lib.rs @@ -45,6 +45,7 @@ pub mod dls; pub mod ip; pub mod mac; mod mac_sys; +pub mod route; pub mod secpolicy; pub mod sys; pub mod xde; diff --git a/xde/src/route.rs b/xde/src/route.rs new file mode 100644 index 00000000..8a4ed54d --- /dev/null +++ b/xde/src/route.rs @@ -0,0 +1,500 @@ +use crate::ip; +use crate::sys; +use crate::xde::xde_underlay_port; +use crate::xde::DropRef; +use crate::xde::XdeDev; +use alloc::collections::BTreeMap; +use alloc::sync::Arc; +use core::ptr; +use core::time::Duration; +use illumos_sys_hdrs::*; +use opte::ddi::sync::KRwLock; +use opte::ddi::sync::KRwLockType; +use opte::ddi::time::Moment; +use opte::engine::ether::EtherAddr; +use opte::engine::ip6::Ipv6Addr; + +// XXX: completely arbitrary timeout. +/// The duration a cached route remains valid for. +const MAX_ROUTE_LIFETIME: Duration = Duration::from_millis(100); + +extern "C" { + pub fn __dtrace_probe_next__hop( + dst: uintptr_t, + gw: uintptr_t, + gw_ether_src: uintptr_t, + gw_ether_dst: uintptr_t, + msg: uintptr_t, + ); +} + +fn next_hop_probe( + dst: &Ipv6Addr, + gw: Option<&Ipv6Addr>, + gw_eth_src: EtherAddr, + gw_eth_dst: EtherAddr, + msg: &[u8], +) { + let gw_bytes = gw.unwrap_or(&Ipv6Addr::from([0u8; 16])).bytes(); + + unsafe { + __dtrace_probe_next__hop( + dst.bytes().as_ptr() as uintptr_t, + gw_bytes.as_ptr() as uintptr_t, + gw_eth_src.to_bytes().as_ptr() as uintptr_t, + gw_eth_dst.to_bytes().as_ptr() as uintptr_t, + msg.as_ptr() as uintptr_t, + ); + } +} + +// The following are wrappers for reference drop functions used in XDE. + +fn ire_refrele(ire: *mut ip::ire_t) { + unsafe { ip::ire_refrele(ire) } +} + +fn nce_refrele(ire: *mut ip::nce_t) { + unsafe { ip::nce_refrele(ire) } +} + +fn netstack_rele(ns: *mut ip::netstack_t) { + unsafe { ip::netstack_rele(ns) } +} + +// At this point the core engine of OPTE has delivered a Geneve +// encapsulated guest Ethernet Frame (also simply referred to as "the +// packet") to xde to be sent to the specific outer IPv6 destination +// address. This packet includes the outer Ethernet Frame as well; +// however, the outer frame's destination and source addresses are set +// to zero. It is the job of this function to determine what those +// values should be. +// +// Adjacent to xde is the native IPv6 stack along with its routing +// table. This table is routinely updated to indicate the best path to +// any given IPv6 destination that may be specified in the outer IP +// header. As xde is not utilizing the native IPv6 stack to send out +// the packet, but rather is handing it directly to the mac module, it +// must somehow query the native routing table to determine which port +// this packet should egress and fill in the outer frame accordingly. +// This query is done via a private interface which allows a kernel +// module outside of IP to query the routing table. +// +// This process happens in a sequence of steps described below. +// +// 1. With an IPv6 destination in hand we need to determine the next +// hop, also known as the gateway, for this address. That is, of +// our neighbors (in this case one of the two switches, which are +// also acting as routers), who should we forward this packet to in +// order for it to arrive at its destination? We get this +// information from the routing table, which contains Internet +// Routing Entries, or IREs. Specifically, we query the native IPv6 +// routing table using the kernel function +// `ire_ftable_lookup_simple_v6()`. This function returns an +// `ire_t`, which includes the member `ire_u`, which contains the +// address of the gateway as `ire6_gateway_addr`. +// +// 2. We have the gateway IPv6 address; but in the world of the Oxide +// Network that is not enough to deliver the packet. In the Oxide +// Network the router (switch) is not a member of the host's +// network. Instead, we rely on link-local addresses to reach the +// switches. The lookup in step (1) gave us that link-local address +// of the gateway; now we need to figure out how to reach it. That +// requires consulting the routing table a second time: this time +// to find the IRE for the gateway's link-local address. +// +// 3. The IRE of the link-local address from step (2) allows us to +// determine which interface this traffic should traverse. +// Specifically it gives us access to the `ill_t` of the gateway's +// link-local address. This structure contains the IP Lower Level +// information. In particular it contains the `ill_phys_addr` +// which gives us the source MAC address for our outer frame. +// +// 4. The final piece of information to obtain is the destination MAC +// address. We have the link-local address of the switch port we +// want to send to. To get the MAC address of this port it must +// first be assumed that the host and its connected switches have +// performed NDP in order to learn each other's IPv6 addresses and +// corresponding MAC addresses. With that information in hand it is +// a matter of querying the kernel's Neighbor Cache Entry Table +// (NCE) for the mapping that belongs to our gateway's link-local +// address. This is done via the `nce_lookup_v6()` kernel function. +// +// With those four steps we have obtained the source and destination +// MAC addresses and the packet can be sent to mac to be delivered to +// the underlying NIC. However, the careful reader may find themselves +// confused about how step (1) actually works. +// +// If step (1) always returns a single gateway, then how do we +// actually utilize both NICs/switches? +// +// This is where a bit of knowledge about routing tables comes into +// play along with our very own Delay Driven Multipath in-rack routing +// protocol. You might imagine the IPv6 routing table on an Oxide Sled +// looking something like this. +// +// Destination/Mask Gateway Flags If +// ---------------- ------------------------- ----- --------- +// default fe80:: UG cxgbe0 +// default fe80:: UG cxgbe1 +// fe80::/10 fe80:: U cxgbe0 +// fe80::/10 fe80:: U cxgbe1 +// fd00:::/64 fe80:: U cxgbe0 +// fd00:::/64 fe80:: U cxgbe1 +// +// Let's say this host (sled1) wants to send a packet to sled2. Our +// sled1 host lives on network `fd00:::/64` while our +// sled2 host lives on `fd00:::/64` -- the key point +// being they are two different networks and thus must be routed to +// talk to each other. For sled1 to send this packet it will attempt +// to look up destination `fd00:::7777` (in this case +// `7777` is the IP of sled2) in the routing table above. The routing +// table will then perform a longest prefix match against the +// `Destination` field for all entries: the longest prefix that +// matches wins and that entry is returned. However, in this case, no +// destinations match except for the `default` ones. When more than +// one entry matches it is left to the system to decide which one to +// return; typically this just means the first one that matches. But +// not for us! This is where DDM comes into play. +// +// Let's reimagine the routing table again, this time with a +// probability added to each gateway entry. +// +// Destination/Mask Gateway Flags If P +// ---------------- ------------------------- ----- ------- ---- +// default fe80:: UG cxgbe0 0.70 +// default fe80:: UG cxgbe1 0.30 +// fe80::/10 fe80:: U cxgbe0 +// fe80::/10 fe80:: U cxgbe1 +// fd00:::/64 fe80:: U cxgbe0 +// fd00:::/64 fe80:: U cxgbe1 +// +// With these P values added we now have a new option for deciding +// which IRE to return when faced with two matches: give each a +// probability of return based on their P value. In this case, for any +// given gateway IRE lookup, there would be a 70% chance +// `fe80::` is returned and a 30% chance `fe80::` is +// returned. +// +// But wait, what determines those P values? That's the job of DDM. +// The full story of what DDM is and how it works is outside the scope +// of this already long block comment; but suffice to say it monitors +// the flow of the network based on precise latency measurements and +// with that data constantly refines the P values of all the hosts's +// routing tables to bias new packets towards one path or another. +#[no_mangle] +fn next_hop<'a>( + key: &RouteKey, + ustate: &'a XdeDev, +) -> Result, &'a xde_underlay_port> { + let RouteKey { dst: ip6_dst, l4_hash: overlay_hash } = key; + unsafe { + // Use the GZ's routing table. + let netstack = + DropRef::new(netstack_rele, ip::netstack_find_by_zoneid(0)); + assert!(!netstack.inner().is_null()); + let ipst = (*netstack.inner()).netstack_u.nu_s.nu_ip; + assert!(!ipst.is_null()); + + let addr = ip::in6_addr_t { + _S6_un: ip::in6_addr__bindgen_ty_1 { _S6_u8: key.dst.bytes() }, + }; + let xmit_hint = overlay_hash.unwrap_or(0); + let mut generation_op = 0u32; + + let mut underlay_port = &*ustate.u1; + + // Step (1): Lookup the IRE for the destination. This is going + // to return one of the default gateway entries. + let ire = DropRef::new( + ire_refrele, + ip::ire_ftable_lookup_v6( + &addr, + ptr::null(), + ptr::null(), + 0, + ptr::null_mut(), + sys::ALL_ZONES, + ptr::null(), + 0, + xmit_hint, + ipst, + &mut generation_op as *mut ip::uint_t, + ), + ); + + // TODO If there is no entry should we return host + // unreachable? I'm not sure since really the guest would map + // that with its VPC network. That is, if a user saw host + // unreachable they would be correct to think that their VPC + // routing table is misconfigured, but in reality it would be + // an underlay network issue. How do we convey this situation + // to the user/operator? + if ire.inner().is_null() { + // Try without a pinned ill + opte::engine::dbg!("no IRE for destination {:?}", ip6_dst); + next_hop_probe( + ip6_dst, + None, + EtherAddr::zero(), + EtherAddr::zero(), + b"no IRE for destination\0", + ); + return Err(underlay_port); + } + let ill = (*ire.inner()).ire_ill; + if ill.is_null() { + opte::engine::dbg!("destination ILL is NULL for {:?}", ip6_dst); + next_hop_probe( + ip6_dst, + None, + EtherAddr::zero(), + EtherAddr::zero(), + b"destination ILL is NULL\0", + ); + return Err(underlay_port); + } + + // Step (2): Lookup the IRE for the gateway's link-local + // address. This is going to return one of the `fe80::/10` + // entries. + let ireu = (*ire.inner()).ire_u; + let gw = ireu.ire6_u.ire6_gateway_addr; + let gw_ip6 = Ipv6Addr::from(&ireu.ire6_u.ire6_gateway_addr); + + // NOTE: specifying the ill is important here, because the gateway + // address is going to be of the form fe80::. This means a + // simple query that does not specify an ill could come back with any + // route matching fe80::/10 over any interface. Since all interfaces + // that have an IPv6 link-local address assigned have an associated + // fe80::/10 route, we must restrict our search to the interface that + // actually has a route to the desired (non-link-local) destination. + let flags = ip::MATCH_IRE_ILL as i32; + let gw_ire = DropRef::new( + ire_refrele, + ip::ire_ftable_lookup_v6( + &gw, + ptr::null(), + ptr::null(), + 0, + ill, + sys::ALL_ZONES, + ptr::null(), + flags, + xmit_hint, + ipst, + &mut generation_op as *mut ip::uint_t, + ), + ); + + if gw_ire.inner().is_null() { + opte::engine::dbg!("no IRE for gateway {:?}", gw_ip6); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + EtherAddr::zero(), + EtherAddr::zero(), + b"no IRE for gateway\0", + ); + return Err(underlay_port); + } + + // Step (3): Determine the source address of the outer frame + // from the physical address of the IP Lower Layer object + // member or the internet routing entry. + let src = (*ill).ill_phys_addr; + if src.is_null() { + opte::engine::dbg!( + "gateway ILL phys addr is NULL for {:?}", + gw_ip6 + ); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + EtherAddr::zero(), + EtherAddr::zero(), + b"gateway ILL phys addr is NULL\0", + ); + return Err(underlay_port); + } + + let src: [u8; 6] = alloc::slice::from_raw_parts(src, 6) + .try_into() + .expect("src mac from pointer"); + + // Switch to the 2nd underlay device if we determine the source mac + // belongs to that device. + if src == ustate.u2.mac { + underlay_port = &ustate.u2; + } + + let src = EtherAddr::from(src); + + // Step (4): Determine the destination address of the outer + // frame by retrieving the NCE entry for the gateway's + // link-local address. + let nce = DropRef::new(nce_refrele, ip::nce_lookup_v6(ill, &gw)); + if nce.inner().is_null() { + opte::engine::dbg!("no NCE for gateway {:?}", gw_ip6); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + src, + EtherAddr::zero(), + b"no NCE for gateway\0", + ); + return Err(underlay_port); + } + + let nce_common = (*nce.inner()).nce_common; + if nce_common.is_null() { + opte::engine::dbg!("no NCE common for gateway {:?}", gw_ip6); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + src, + EtherAddr::zero(), + b"no NCE common for gateway\0", + ); + return Err(underlay_port); + } + + let mac = (*nce_common).ncec_lladdr; + if mac.is_null() { + opte::engine::dbg!("NCE MAC address is NULL {:?}", gw_ip6); + next_hop_probe( + ip6_dst, + Some(&gw_ip6), + src, + EtherAddr::zero(), + b"NCE MAC address if NULL for gateway\0", + ); + return Err(underlay_port); + } + + let maclen = (*nce_common).ncec_lladdr_length; + assert!(maclen == 6); + + let dst: [u8; 6] = alloc::slice::from_raw_parts(mac, 6) + .try_into() + .expect("mac from pointer"); + let dst = EtherAddr::from(dst); + + next_hop_probe(ip6_dst, Some(&gw_ip6), src, dst, b"\0"); + + Ok(Route { src, dst, underlay_dev: underlay_port }) + } +} + +/// A simple caching layer over `next_hop`. +pub struct RouteCache(Arc>>); + +impl Default for RouteCache { + fn default() -> Self { + let mut lock = KRwLock::new(BTreeMap::new()); + lock.init(KRwLockType::Driver); + Self(lock.into()) + } +} + +impl RouteCache { + pub fn next_hop<'b>(&self, key: RouteKey, xde: &'b XdeDev) -> Route<'b> { + let t = Moment::now(); + + let maybe_route = { + let route_cache = self.0.read(); + route_cache.get(&key).copied() + }; + + match maybe_route { + Some(route) if route.is_still_valid(t) => { + return route.into_route(xde) + } + _ => {} + } + + // Cache miss: intent is to now ask illumos, then insert. + let mut route_cache = self.0.write(); + + // Someone else may have written while we were taking the lock. + // DO NOT waste time if there's a good route. + let maybe_route = route_cache.get(&key).copied(); + match maybe_route { + Some(route) if route.is_still_valid(t) => { + return route.into_route(xde) + } + _ => {} + } + + // `next_hop` might fail for myriad reasons, but we still + // send the packet on an underlay device depending on our + // progress. However, we do not want to cache bad mappings. + match next_hop(&key, xde) { + Ok(route) => { + route_cache.insert(key, route.cached(xde, t)); + route + } + Err(underlay_dev) => Route { + src: EtherAddr::zero(), + dst: EtherAddr::zero(), + underlay_dev, + }, + } + } +} + +/// An underlay routing destination and flow-dependent entropy. +#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] +pub struct RouteKey { + pub dst: Ipv6Addr, + pub l4_hash: Option, +} + +/// Cached representation of [`Route`]. +#[derive(Copy, Clone, Debug)] +pub struct CachedRoute { + pub src: EtherAddr, + pub dst: EtherAddr, + pub underlay_idx: u8, + pub timestamp: Moment, +} + +impl CachedRoute { + fn is_still_valid(&self, t: Moment) -> bool { + t.delta_as_millis(self.timestamp) + <= MAX_ROUTE_LIFETIME.as_millis() as u64 + } + + fn into_route(self, xde: &XdeDev) -> Route<'_> { + Route { + src: self.src, + dst: self.dst, + // This is not a pretty construction, and will not work for + // a hypothetically higher port count. + underlay_dev: if self.underlay_idx == 0 { + &xde.u1 + } else { + &xde.u2 + }, + } + } +} + +/// Output port and L2 information needed to emit a packet over the underlay. +#[derive(Copy, Clone, Debug)] +pub struct Route<'a> { + pub src: EtherAddr, + pub dst: EtherAddr, + pub underlay_dev: &'a xde_underlay_port, +} + +impl Route<'_> { + fn cached(&self, xde: &XdeDev, timestamp: Moment) -> CachedRoute { + // As unfortunate as `into_route`. + let port_0: &xde_underlay_port = &xde.u1; + let underlay_idx = + if core::ptr::addr_eq(self.underlay_dev, port_0) { 0 } else { 1 }; + + CachedRoute { src: self.src, dst: self.dst, underlay_idx, timestamp } + } +} diff --git a/xde/src/xde.rs b/xde/src/xde.rs index d19cafea..727f67b4 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -14,7 +14,6 @@ use crate::dls; use crate::ioctl::IoctlEnvelope; -use crate::ip; use crate::mac; use crate::mac::mac_getinfo; use crate::mac::mac_private_minor; @@ -24,6 +23,9 @@ use crate::mac::MacOpenFlags; use crate::mac::MacPromiscHandle; use crate::mac::MacTxFlags; use crate::mac::MacUnicastHandle; +use crate::route::Route; +use crate::route::RouteCache; +use crate::route::RouteKey; use crate::secpolicy; use crate::sys; use crate::warn; @@ -33,7 +35,6 @@ use alloc::string::String; use alloc::string::ToString; use alloc::sync::Arc; use alloc::vec::Vec; -use core::convert::TryInto; use core::ffi::CStr; use core::num::NonZeroU32; use core::ptr; @@ -53,7 +54,6 @@ use opte::ddi::sync::KMutexType; use opte::ddi::sync::KRwLock; use opte::ddi::sync::KRwLockType; use opte::ddi::time::Interval; -use opte::ddi::time::Moment; use opte::ddi::time::Periodic; use opte::engine::ether::EtherAddr; use opte::engine::geneve::Vni; @@ -197,34 +197,14 @@ fn bad_packet_probe( }; } -fn next_hop_probe( - dst: &Ipv6Addr, - gw: Option<&Ipv6Addr>, - gw_eth_src: EtherAddr, - gw_eth_dst: EtherAddr, - msg: &[u8], -) { - let gw_bytes = gw.unwrap_or(&Ipv6Addr::from([0u8; 16])).bytes(); - - unsafe { - __dtrace_probe_next__hop( - dst.bytes().as_ptr() as uintptr_t, - gw_bytes.as_ptr() as uintptr_t, - gw_eth_src.to_bytes().as_ptr() as uintptr_t, - gw_eth_dst.to_bytes().as_ptr() as uintptr_t, - msg.as_ptr() as uintptr_t, - ); - } -} - /// Underlay port state. #[derive(Debug)] -struct xde_underlay_port { +pub struct xde_underlay_port { /// Name of the link being used for this underlay port. - name: String, + pub name: String, /// The MAC address associated with this underlay port. - mac: [u8; 6], + pub mac: [u8; 6], /// MAC handle to the underlay link. mh: Arc, @@ -276,7 +256,7 @@ impl XdeState { } #[repr(C)] -struct XdeDev { +pub struct XdeDev { devname: String, linkid: datalink_id_t, mh: *mut mac::mac_handle, @@ -300,8 +280,13 @@ struct XdeDev { // These are clones of the underlay ports initialized by the // driver. - u1: Arc, - u2: Arc, + pub u1: Arc, + pub u2: Arc, + + // We make this a per-port cache rather than sharing between all + // ports to theoretically reduce contention around route expiry + // and reinsertion. + routes: RouteCache, } #[cfg(not(test))] @@ -695,6 +680,7 @@ fn create_xde(req: &CreateXdeReq) -> Result { passthrough: req.passthrough, u1: underlay.u1.clone(), u2: underlay.u2.clone(), + routes: RouteCache::default(), }); drop(underlay_); @@ -1421,33 +1407,6 @@ fn guest_loopback( ptr::null_mut() } -// XXX: clean this up, after measuring. -// XXX: completely arbitrary timeout. -const MAX_ROUTE_LIFETIME: Duration = Duration::from_millis(100); -static ROUTE_CACHE: KRwLock< - alloc::collections::BTreeMap, -> = KRwLock::new(alloc::collections::BTreeMap::new()); -#[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] -struct RouteKey { - dst: Ipv6Addr, - l4_hash: Option, -} - -#[derive(Copy, Clone, Debug)] -struct CachedRoute { - src: EtherAddr, - dst: EtherAddr, - underlay_idx: u8, - timestamp: Moment, -} - -impl CachedRoute { - pub fn is_still_valid(&self, t: Moment) -> bool { - t.delta_as_millis(self.timestamp) - <= MAX_ROUTE_LIFETIME.as_millis() as u64 - } -} - #[no_mangle] unsafe extern "C" fn xde_mc_tx( arg: *mut c_void, @@ -1570,73 +1529,9 @@ unsafe extern "C" fn xde_mc_tx( // results for a given dst + entropy. These have a fairly tight // expiry so that we can actually react to new reachability/load // info from DDM. - // XXX: this is an absolute mess. let my_key = RouteKey { dst: ip6.dst, l4_hash: meta.l4_hash() }; - let mut maybe_route = { - let route_cache = ROUTE_CACHE.read(); - route_cache.get(&my_key).copied() - }; - let t = Moment::now(); - - let recompute = match maybe_route { - None => true, - Some(route) => !route.is_still_valid(t), - }; - let (src, dst, underlay_dev) = if recompute { - let mut route_cache = ROUTE_CACHE.write(); - // Someone else may have written while we were - // taking the lock. DO NOT waste time if it's a - // good route. - maybe_route = route_cache.get(&my_key).copied(); - let recompute = match maybe_route { - None => true, - Some(route) => !route.is_still_valid(t), - }; - if recompute { - let (src, dst, underlay_dev) = - next_hop(&ip6.dst, src_dev, my_key.l4_hash); - - // This is terrible. - let underlay_idx = if underlay_dev as *const _ - == (&*src_dev.u1) as *const _ - { - 0 - } else { - 1 - }; - - route_cache.insert( - my_key, - CachedRoute { src, dst, underlay_idx, timestamp: t }, - ); - - (src, dst, underlay_dev) - } else { - let x = maybe_route.unwrap(); - - ( - x.src, - x.dst, - if x.underlay_idx == 0 { - &*src_dev.u1 - } else { - &*src_dev.u2 - }, - ) - } - } else { - let x = maybe_route.unwrap(); - - ( - x.src, - x.dst, - if x.underlay_idx == 0 { - &*src_dev.u1 - } else { - &*src_dev.u2 - }, - ) - }; + let Route { src, dst, underlay_dev } = + src_dev.routes.next_hop(my_key, src_dev); // Get a pointer to the beginning of the outer frame and // fill in the dst/src addresses before sending out the @@ -1677,7 +1572,7 @@ unsafe extern "C" fn xde_mc_tx( /// This is a generic wrapper for references that should be dropped once not in /// use. -struct DropRef +pub(crate) struct DropRef where DropFn: Fn(*mut Arg), { @@ -1693,12 +1588,12 @@ where { /// Create a new `DropRef` for the provided reference argument. When this /// object is dropped, the provided `func` will be called. - fn new(func: DropFn, arg: *mut Arg) -> Self { + pub(crate) fn new(func: DropFn, arg: *mut Arg) -> Self { Self { func, arg } } /// Return a pointer to the underlying reference. - fn inner(&self) -> *mut Arg { + pub(crate) fn inner(&self) -> *mut Arg { self.arg } } @@ -1715,344 +1610,6 @@ where } } -// The following are wrappers for reference drop functions used in XDE. - -fn ire_refrele(ire: *mut ip::ire_t) { - unsafe { ip::ire_refrele(ire) } -} - -fn nce_refrele(ire: *mut ip::nce_t) { - unsafe { ip::nce_refrele(ire) } -} - -fn netstack_rele(ns: *mut ip::netstack_t) { - unsafe { ip::netstack_rele(ns) } -} - -// At this point the core engine of OPTE has delivered a Geneve -// encapsulated guest Ethernet Frame (also simply referred to as "the -// packet") to xde to be sent to the specific outer IPv6 destination -// address. This packet includes the outer Ethernet Frame as well; -// however, the outer frame's destination and source addresses are set -// to zero. It is the job of this function to determine what those -// values should be. -// -// Adjacent to xde is the native IPv6 stack along with its routing -// table. This table is routinely updated to indicate the best path to -// any given IPv6 destination that may be specified in the outer IP -// header. As xde is not utilizing the native IPv6 stack to send out -// the packet, but rather is handing it directly to the mac module, it -// must somehow query the native routing table to determine which port -// this packet should egress and fill in the outer frame accordingly. -// This query is done via a private interface which allows a kernel -// module outside of IP to query the routing table. -// -// This process happens in a sequence of steps described below. -// -// 1. With an IPv6 destination in hand we need to determine the next -// hop, also known as the gateway, for this address. That is, of -// our neighbors (in this case one of the two switches, which are -// also acting as routers), who should we forward this packet to in -// order for it to arrive at its destination? We get this -// information from the routing table, which contains Internet -// Routing Entries, or IREs. Specifically, we query the native IPv6 -// routing table using the kernel function -// `ire_ftable_lookup_simple_v6()`. This function returns an -// `ire_t`, which includes the member `ire_u`, which contains the -// address of the gateway as `ire6_gateway_addr`. -// -// 2. We have the gateway IPv6 address; but in the world of the Oxide -// Network that is not enough to deliver the packet. In the Oxide -// Network the router (switch) is not a member of the host's -// network. Instead, we rely on link-local addresses to reach the -// switches. The lookup in step (1) gave us that link-local address -// of the gateway; now we need to figure out how to reach it. That -// requires consulting the routing table a second time: this time -// to find the IRE for the gateway's link-local address. -// -// 3. The IRE of the link-local address from step (2) allows us to -// determine which interface this traffic should traverse. -// Specifically it gives us access to the `ill_t` of the gateway's -// link-local address. This structure contains the IP Lower Level -// information. In particular it contains the `ill_phys_addr` -// which gives us the source MAC address for our outer frame. -// -// 4. The final piece of information to obtain is the destination MAC -// address. We have the link-local address of the switch port we -// want to send to. To get the MAC address of this port it must -// first be assumed that the host and its connected switches have -// performed NDP in order to learn each other's IPv6 addresses and -// corresponding MAC addresses. With that information in hand it is -// a matter of querying the kernel's Neighbor Cache Entry Table -// (NCE) for the mapping that belongs to our gateway's link-local -// address. This is done via the `nce_lookup_v6()` kernel function. -// -// With those four steps we have obtained the source and destination -// MAC addresses and the packet can be sent to mac to be delivered to -// the underlying NIC. However, the careful reader may find themselves -// confused about how step (1) actually works. -// -// If step (1) always returns a single gateway, then how do we -// actually utilize both NICs/switches? -// -// This is where a bit of knowledge about routing tables comes into -// play along with our very own Delay Driven Multipath in-rack routing -// protocol. You might imagine the IPv6 routing table on an Oxide Sled -// looking something like this. -// -// Destination/Mask Gateway Flags If -// ---------------- ------------------------- ----- --------- -// default fe80:: UG cxgbe0 -// default fe80:: UG cxgbe1 -// fe80::/10 fe80:: U cxgbe0 -// fe80::/10 fe80:: U cxgbe1 -// fd00:::/64 fe80:: U cxgbe0 -// fd00:::/64 fe80:: U cxgbe1 -// -// Let's say this host (sled1) wants to send a packet to sled2. Our -// sled1 host lives on network `fd00:::/64` while our -// sled2 host lives on `fd00:::/64` -- the key point -// being they are two different networks and thus must be routed to -// talk to each other. For sled1 to send this packet it will attempt -// to look up destination `fd00:::7777` (in this case -// `7777` is the IP of sled2) in the routing table above. The routing -// table will then perform a longest prefix match against the -// `Destination` field for all entries: the longest prefix that -// matches wins and that entry is returned. However, in this case, no -// destinations match except for the `default` ones. When more than -// one entry matches it is left to the system to decide which one to -// return; typically this just means the first one that matches. But -// not for us! This is where DDM comes into play. -// -// Let's reimagine the routing table again, this time with a -// probability added to each gateway entry. -// -// Destination/Mask Gateway Flags If P -// ---------------- ------------------------- ----- ------- ---- -// default fe80:: UG cxgbe0 0.70 -// default fe80:: UG cxgbe1 0.30 -// fe80::/10 fe80:: U cxgbe0 -// fe80::/10 fe80:: U cxgbe1 -// fd00:::/64 fe80:: U cxgbe0 -// fd00:::/64 fe80:: U cxgbe1 -// -// With these P values added we now have a new option for deciding -// which IRE to return when faced with two matches: give each a -// probability of return based on their P value. In this case, for any -// given gateway IRE lookup, there would be a 70% chance -// `fe80::` is returned and a 30% chance `fe80::` is -// returned. -// -// But wait, what determines those P values? That's the job of DDM. -// The full story of what DDM is and how it works is outside the scope -// of this already long block comment; but suffice to say it monitors -// the flow of the network based on precise latency measurements and -// with that data constantly refines the P values of all the hosts's -// routing tables to bias new packets towards one path or another. -#[no_mangle] -fn next_hop<'a>( - ip6_dst: &Ipv6Addr, - ustate: &'a XdeDev, - overlay_hash: Option, -) -> (EtherAddr, EtherAddr, &'a xde_underlay_port) { - unsafe { - // Use the GZ's routing table. - let netstack = - DropRef::new(netstack_rele, ip::netstack_find_by_zoneid(0)); - assert!(!netstack.inner().is_null()); - let ipst = (*netstack.inner()).netstack_u.nu_s.nu_ip; - assert!(!ipst.is_null()); - - let addr = ip::in6_addr_t { - _S6_un: ip::in6_addr__bindgen_ty_1 { _S6_u8: ip6_dst.bytes() }, - }; - let xmit_hint = overlay_hash.unwrap_or(0); - let mut generation_op = 0u32; - - let mut underlay_port = &*ustate.u1; - - // Step (1): Lookup the IRE for the destination. This is going - // to return one of the default gateway entries. - let ire = DropRef::new( - ire_refrele, - ip::ire_ftable_lookup_v6( - &addr, - ptr::null(), - ptr::null(), - 0, - ptr::null_mut(), - sys::ALL_ZONES, - ptr::null(), - 0, - xmit_hint, - ipst, - &mut generation_op as *mut ip::uint_t, - ), - ); - - // TODO If there is no entry should we return host - // unreachable? I'm not sure since really the guest would map - // that with its VPC network. That is, if a user saw host - // unreachable they would be correct to think that their VPC - // routing table is misconfigured, but in reality it would be - // an underlay network issue. How do we convey this situation - // to the user/operator? - if ire.inner().is_null() { - // Try without a pinned ill - opte::engine::dbg!("no IRE for destination {:?}", ip6_dst); - next_hop_probe( - ip6_dst, - None, - EtherAddr::zero(), - EtherAddr::zero(), - b"no IRE for destination\0", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - let ill = (*ire.inner()).ire_ill; - if ill.is_null() { - opte::engine::dbg!("destination ILL is NULL for {:?}", ip6_dst); - next_hop_probe( - ip6_dst, - None, - EtherAddr::zero(), - EtherAddr::zero(), - b"destination ILL is NULL\0", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - // Step (2): Lookup the IRE for the gateway's link-local - // address. This is going to return one of the `fe80::/10` - // entries. - let ireu = (*ire.inner()).ire_u; - let gw = ireu.ire6_u.ire6_gateway_addr; - let gw_ip6 = Ipv6Addr::from(&ireu.ire6_u.ire6_gateway_addr); - - // NOTE: specifying the ill is important here, because the gateway - // address is going to be of the form fe80::. This means a - // simple query that does not specify an ill could come back with any - // route matching fe80::/10 over any interface. Since all interfaces - // that have an IPv6 link-local address assigned have an associated - // fe80::/10 route, we must restrict our search to the interface that - // actually has a route to the desired (non-link-local) destination. - let flags = ip::MATCH_IRE_ILL as i32; - let gw_ire = DropRef::new( - ire_refrele, - ip::ire_ftable_lookup_v6( - &gw, - ptr::null(), - ptr::null(), - 0, - ill, - sys::ALL_ZONES, - ptr::null(), - flags, - xmit_hint, - ipst, - &mut generation_op as *mut ip::uint_t, - ), - ); - - if gw_ire.inner().is_null() { - opte::engine::dbg!("no IRE for gateway {:?}", gw_ip6); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - EtherAddr::zero(), - EtherAddr::zero(), - b"no IRE for gateway\0", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - // Step (3): Determine the source address of the outer frame - // from the physical address of the IP Lower Layer object - // member or the internet routing entry. - let src = (*ill).ill_phys_addr; - if src.is_null() { - opte::engine::dbg!( - "gateway ILL phys addr is NULL for {:?}", - gw_ip6 - ); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - EtherAddr::zero(), - EtherAddr::zero(), - b"gateway ILL phys addr is NULL\0", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - let src: [u8; 6] = alloc::slice::from_raw_parts(src, 6) - .try_into() - .expect("src mac from pointer"); - - // Switch to the 2nd underlay device if we determine the source mac - // belongs to that device. - if src == ustate.u2.mac { - underlay_port = &ustate.u2; - } - - let src = EtherAddr::from(src); - - // Step (4): Determine the destination address of the outer - // frame by retrieving the NCE entry for the gateway's - // link-local address. - let nce = DropRef::new(nce_refrele, ip::nce_lookup_v6(ill, &gw)); - if nce.inner().is_null() { - opte::engine::dbg!("no NCE for gateway {:?}", gw_ip6); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - src, - EtherAddr::zero(), - b"no NCE for gateway\0", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - let nce_common = (*nce.inner()).nce_common; - if nce_common.is_null() { - opte::engine::dbg!("no NCE common for gateway {:?}", gw_ip6); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - src, - EtherAddr::zero(), - b"no NCE common for gateway\0", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - let mac = (*nce_common).ncec_lladdr; - if mac.is_null() { - opte::engine::dbg!("NCE MAC address is NULL {:?}", gw_ip6); - next_hop_probe( - ip6_dst, - Some(&gw_ip6), - src, - EtherAddr::zero(), - b"NCE MAC address if NULL for gateway\0", - ); - return (EtherAddr::zero(), EtherAddr::zero(), underlay_port); - } - - let maclen = (*nce_common).ncec_lladdr_length; - assert!(maclen == 6); - - let dst: [u8; 6] = alloc::slice::from_raw_parts(mac, 6) - .try_into() - .expect("mac from pointer"); - let dst = EtherAddr::from(dst); - - next_hop_probe(ip6_dst, Some(&gw_ip6), src, dst, b"\0"); - - (src, dst, underlay_port) - } -} - #[no_mangle] unsafe extern "C" fn xde_mc_getcapab( _arg: *mut c_void, From eb85463b1b4f6d82a9c7ddcfbb68269fdbfb301e Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 19 Apr 2024 12:03:20 +0100 Subject: [PATCH 04/13] Use the correct reference matching function. --- xde/src/route.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xde/src/route.rs b/xde/src/route.rs index 8a4ed54d..c04db746 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -493,7 +493,7 @@ impl Route<'_> { // As unfortunate as `into_route`. let port_0: &xde_underlay_port = &xde.u1; let underlay_idx = - if core::ptr::addr_eq(self.underlay_dev, port_0) { 0 } else { 1 }; + if core::ptr::eq(self.underlay_dev, port_0) { 0 } else { 1 }; CachedRoute { src: self.src, dst: self.dst, underlay_idx, timestamp } } From 653f8e6da918f78c9764398e23686dfb2ca10d5b Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 19 Apr 2024 12:28:02 +0100 Subject: [PATCH 05/13] Add in a period task for cached route expiry. I think this is necessary in principle because we're including hash information in the routing key, and I'd rather not have this state balloon out of control. --- xde/src/route.rs | 38 ++++++++++++++++++++++++++------------ xde/src/xde.rs | 18 +++++++++++++++++- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/xde/src/route.rs b/xde/src/route.rs index c04db746..0b13e029 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -187,7 +187,7 @@ fn next_hop<'a>( key: &RouteKey, ustate: &'a XdeDev, ) -> Result, &'a xde_underlay_port> { - let RouteKey { dst: ip6_dst, l4_hash: overlay_hash } = key; + let RouteKey { dst: ip6_dst, l4_hash } = key; unsafe { // Use the GZ's routing table. let netstack = @@ -199,10 +199,10 @@ fn next_hop<'a>( let addr = ip::in6_addr_t { _S6_un: ip::in6_addr__bindgen_ty_1 { _S6_u8: key.dst.bytes() }, }; - let xmit_hint = overlay_hash.unwrap_or(0); + let xmit_hint = l4_hash.unwrap_or(0); let mut generation_op = 0u32; - let mut underlay_port = &*ustate.u1; + let mut underlay_dev = &*ustate.u1; // Step (1): Lookup the IRE for the destination. This is going // to return one of the default gateway entries. @@ -240,7 +240,7 @@ fn next_hop<'a>( EtherAddr::zero(), b"no IRE for destination\0", ); - return Err(underlay_port); + return Err(underlay_dev); } let ill = (*ire.inner()).ire_ill; if ill.is_null() { @@ -252,7 +252,7 @@ fn next_hop<'a>( EtherAddr::zero(), b"destination ILL is NULL\0", ); - return Err(underlay_port); + return Err(underlay_dev); } // Step (2): Lookup the IRE for the gateway's link-local @@ -296,7 +296,7 @@ fn next_hop<'a>( EtherAddr::zero(), b"no IRE for gateway\0", ); - return Err(underlay_port); + return Err(underlay_dev); } // Step (3): Determine the source address of the outer frame @@ -315,7 +315,7 @@ fn next_hop<'a>( EtherAddr::zero(), b"gateway ILL phys addr is NULL\0", ); - return Err(underlay_port); + return Err(underlay_dev); } let src: [u8; 6] = alloc::slice::from_raw_parts(src, 6) @@ -325,7 +325,7 @@ fn next_hop<'a>( // Switch to the 2nd underlay device if we determine the source mac // belongs to that device. if src == ustate.u2.mac { - underlay_port = &ustate.u2; + underlay_dev = &ustate.u2; } let src = EtherAddr::from(src); @@ -343,7 +343,7 @@ fn next_hop<'a>( EtherAddr::zero(), b"no NCE for gateway\0", ); - return Err(underlay_port); + return Err(underlay_dev); } let nce_common = (*nce.inner()).nce_common; @@ -356,7 +356,7 @@ fn next_hop<'a>( EtherAddr::zero(), b"no NCE common for gateway\0", ); - return Err(underlay_port); + return Err(underlay_dev); } let mac = (*nce_common).ncec_lladdr; @@ -369,7 +369,7 @@ fn next_hop<'a>( EtherAddr::zero(), b"NCE MAC address if NULL for gateway\0", ); - return Err(underlay_port); + return Err(underlay_dev); } let maclen = (*nce_common).ncec_lladdr_length; @@ -382,11 +382,12 @@ fn next_hop<'a>( next_hop_probe(ip6_dst, Some(&gw_ip6), src, dst, b"\0"); - Ok(Route { src, dst, underlay_dev: underlay_port }) + Ok(Route { src, dst, underlay_dev }) } } /// A simple caching layer over `next_hop`. +#[derive(Clone)] pub struct RouteCache(Arc>>); impl Default for RouteCache { @@ -398,6 +399,11 @@ impl Default for RouteCache { } impl RouteCache { + /// Retrieve a [`Route`] (device and L2 information) for a given `key`. + /// + /// This will retrieve an existing entry, if one exists from a recent + /// query, or computes the current route using `next_hop` on miss or + /// discovery of a stale entry. pub fn next_hop<'b>(&self, key: RouteKey, xde: &'b XdeDev) -> Route<'b> { let t = Moment::now(); @@ -441,6 +447,14 @@ impl RouteCache { }, } } + + /// Discards any cached route entries which have expired. + pub fn expire_routes(&self) { + let mut route_cache = self.0.write(); + + let t = Moment::now(); + route_cache.retain(|_, v| v.is_still_valid(t)); + } } /// An underlay routing destination and flow-dependent entropy. diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 727f67b4..08b20feb 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -287,6 +287,7 @@ pub struct XdeDev { // ports to theoretically reduce contention around route expiry // and reinsertion. routes: RouteCache, + routes_periodic: Periodic, } #[cfg(not(test))] @@ -591,6 +592,11 @@ fn expire_periodic(port: &mut Arc>) { let _ = port.expire_flows(); } +#[no_mangle] +fn expire_route_cache(routes: &mut RouteCache) { + routes.expire_routes() +} + #[no_mangle] fn create_xde(req: &CreateXdeReq) -> Result { // TODO name validation @@ -667,6 +673,15 @@ fn create_xde(req: &CreateXdeReq) -> Result { ONE_SECOND, ); + let routes = RouteCache::default(); + + let routes_periodic = Periodic::new( + port.name_cstr().clone(), + expire_route_cache, + Box::new(routes.clone()), + ONE_SECOND, + ); + let mut xde = Box::new(XdeDev { devname: req.xde_devname.clone(), linkid: req.linkid, @@ -680,7 +695,8 @@ fn create_xde(req: &CreateXdeReq) -> Result { passthrough: req.passthrough, u1: underlay.u1.clone(), u2: underlay.u2.clone(), - routes: RouteCache::default(), + routes, + routes_periodic, }); drop(underlay_); From 5c4a8a768eee0d3b054bb1b7a8d8ac0478780aeb Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 26 Apr 2024 20:03:48 +0100 Subject: [PATCH 06/13] Review discussion: Cap size of routing cache. This also includes more details on the relative costs we're trying to offset, and the futur ease for impling/finding a suitable hashmap. --- xde/src/lib.rs | 2 +- xde/src/route.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/xde/src/lib.rs b/xde/src/lib.rs index e6026582..d7ff6301 100644 --- a/xde/src/lib.rs +++ b/xde/src/lib.rs @@ -2,7 +2,7 @@ // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at https://mozilla.org/MPL/2.0/. -// Copyright 2022 Oxide Computer Company +// Copyright 2024 Oxide Computer Company // xde - A mac provider for OPTE-based network implementations. #![feature(extern_types)] diff --git a/xde/src/route.rs b/xde/src/route.rs index 0b13e029..8e9e92ea 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -1,3 +1,9 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +// Copyright 2024 Oxide Computer Company + use crate::ip; use crate::sys; use crate::xde::xde_underlay_port; @@ -18,6 +24,9 @@ use opte::engine::ip6::Ipv6Addr; /// The duration a cached route remains valid for. const MAX_ROUTE_LIFETIME: Duration = Duration::from_millis(100); +/// Maximum cache size, set to prevent excessive map modification latency. +const MAX_CACHE_ENTRIES: usize = 512; + extern "C" { pub fn __dtrace_probe_next__hop( dst: uintptr_t, @@ -144,7 +153,7 @@ fn netstack_rele(ns: *mut ip::netstack_t) { // // Let's say this host (sled1) wants to send a packet to sled2. Our // sled1 host lives on network `fd00:::/64` while our -// sled2 host lives on `fd00:::/64` -- the key point +// sled2 host lives on `fd00:::/64` -- the key point // being they are two different networks and thus must be routed to // talk to each other. For sled1 to send this packet it will attempt // to look up destination `fd00:::7777` (in this case @@ -387,6 +396,35 @@ fn next_hop<'a>( } /// A simple caching layer over `next_hop`. +/// +/// [`next_hop`] has a latency distribution which roughly looks like this: +/// ```text +/// t(ns) Count +/// 1024 | 337 +/// 1280 | 108 +/// 1536 |@@@@@@@@@@@@@@@@@@@@@ 376883 +/// 1792 |@@@@@@@@@@@@@@@ 264693 +/// 2048 |@ 17798 +/// 2304 |@ 14791 +/// 2560 |@@ 32901 +/// 2816 |@ 10730 +/// 3072 | 3459 +/// ``` +/// +/// Naturally, bringing this down to O(ns) is desirable. Usually, illumos +/// holds `ire_t`s per `conn_t`, but we're aiming to be more fine-grained +/// with DDM -- so we need a tradeoff between 'asking about the best route +/// per-packet' and 'holding a route until it is expired'. We choose, for noe, +/// to hold a route for 100ms. +/// +/// Note, this uses a `BTreeMap`, but we would prefer the more consistent +/// (faster) add/remove costs of a `HashMap`. As `BTreeMap` modification costs +/// outpace the cost of `next_hop` between 256--512 entries, we currently set 512 +/// as a cap on cache size to prevent significant packet stalls. This may be tricky +/// to tune. +/// +/// (See: https://github.com/oxidecomputer/opte/pull/499#discussion_r1581164767 +/// for some performance numbers.) #[derive(Clone)] pub struct RouteCache(Arc>>); @@ -432,15 +470,28 @@ impl RouteCache { _ => {} } + // We've had a definitive flow miss, but we need to cap the cache + // size to prevent excessive modification latencies at high flow + // counts. + // If full and we have no old entry to update, drop the lock and do + // not insert. + // XXX: Want to profile in future to see if LRU expiry is + // affordable/sane here. + // XXX: A HashMap would exchange insert cost for lookup. + let route_cache = (maybe_route.is_some() + || route_cache.len() <= MAX_CACHE_ENTRIES) + .then_some(route_cache); + // `next_hop` might fail for myriad reasons, but we still // send the packet on an underlay device depending on our // progress. However, we do not want to cache bad mappings. - match next_hop(&key, xde) { - Ok(route) => { + match (route_cache, next_hop(&key, xde)) { + (Some(mut route_cache), Ok(route)) => { route_cache.insert(key, route.cached(xde, t)); route } - Err(underlay_dev) => Route { + (None, Ok(route)) => route, + (_, Err(underlay_dev)) => Route { src: EtherAddr::zero(), dst: EtherAddr::zero(), underlay_dev, From 3230a52ded6e8a29163b4f8afc603baec3fadbb9 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 26 Apr 2024 22:37:14 +0100 Subject: [PATCH 07/13] Separate route removal from route expiry. This places 'requesting a new IRE from illumos' and 'removing the allocated slot in the RouteCache` onto two different timers. This should allow a flow to keep reusing its existing slot without needing to redo an expensive insert incurred after a removal. --- xde/src/route.rs | 40 +++++++++++++++++++++++++--------------- xde/src/xde.rs | 2 +- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/xde/src/route.rs b/xde/src/route.rs index 8e9e92ea..d6d9a485 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -20,9 +20,17 @@ use opte::ddi::time::Moment; use opte::engine::ether::EtherAddr; use opte::engine::ip6::Ipv6Addr; -// XXX: completely arbitrary timeout. -/// The duration a cached route remains valid for. -const MAX_ROUTE_LIFETIME: Duration = Duration::from_millis(100); +// XXX: completely arbitrary timeouts. +/// The duration a cached route remains valid for before it must be +/// refreshed. +/// +/// Expired routes will not be removed from the cache, and will leave +/// an entry to enable a quick in-place refresh in the `BTreeMap`. +const EXPIRE_ROUTE_LIFETIME: Duration = Duration::from_millis(100); + +/// The time after which a route should be completely removed from +/// the cache. +const REMOVE_ROUTE_LIFETIME: Duration = Duration::from_millis(1000); /// Maximum cache size, set to prevent excessive map modification latency. const MAX_CACHE_ENTRIES: usize = 512; @@ -451,22 +459,18 @@ impl RouteCache { }; match maybe_route { - Some(route) if route.is_still_valid(t) => { - return route.into_route(xde) - } + Some(route) if route.is_valid(t) => return route.into_route(xde), _ => {} } // Cache miss: intent is to now ask illumos, then insert. - let mut route_cache = self.0.write(); + let route_cache = self.0.write(); // Someone else may have written while we were taking the lock. // DO NOT waste time if there's a good route. let maybe_route = route_cache.get(&key).copied(); match maybe_route { - Some(route) if route.is_still_valid(t) => { - return route.into_route(xde) - } + Some(route) if route.is_valid(t) => return route.into_route(xde), _ => {} } @@ -499,12 +503,13 @@ impl RouteCache { } } - /// Discards any cached route entries which have expired. - pub fn expire_routes(&self) { + /// Discards any cached route entries which have been present + /// for longer than `REMOVE_ROUTE_LIFETIME`. + pub fn remove_routes(&self) { let mut route_cache = self.0.write(); let t = Moment::now(); - route_cache.retain(|_, v| v.is_still_valid(t)); + route_cache.retain(|_, v| v.is_retained(t)); } } @@ -525,9 +530,14 @@ pub struct CachedRoute { } impl CachedRoute { - fn is_still_valid(&self, t: Moment) -> bool { + fn is_valid(&self, t: Moment) -> bool { + t.delta_as_millis(self.timestamp) + <= EXPIRE_ROUTE_LIFETIME.as_millis() as u64 + } + + fn is_retained(&self, t: Moment) -> bool { t.delta_as_millis(self.timestamp) - <= MAX_ROUTE_LIFETIME.as_millis() as u64 + <= REMOVE_ROUTE_LIFETIME.as_millis() as u64 } fn into_route(self, xde: &XdeDev) -> Route<'_> { diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 87553a28..0d3d01e4 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -604,7 +604,7 @@ fn expire_periodic(port: &mut Arc>) { #[no_mangle] fn expire_route_cache(routes: &mut RouteCache) { - routes.expire_routes() + routes.remove_routes() } #[no_mangle] From efea9aaa7a70952f6ddfb3e5c1427b051389fb29 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Fri, 26 Apr 2024 23:42:23 +0100 Subject: [PATCH 08/13] Use `Entry` to trim one lookup on cache entry refresh. --- xde/src/route.rs | 60 +++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/xde/src/route.rs b/xde/src/route.rs index d6d9a485..7dc6c347 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -9,6 +9,7 @@ use crate::sys; use crate::xde::xde_underlay_port; use crate::xde::DropRef; use crate::xde::XdeDev; +use alloc::collections::btree_map::Entry; use alloc::collections::BTreeMap; use alloc::sync::Arc; use core::ptr; @@ -464,15 +465,19 @@ impl RouteCache { } // Cache miss: intent is to now ask illumos, then insert. - let route_cache = self.0.write(); + let mut route_cache = self.0.write(); + let space_remaining = route_cache.len() < MAX_CACHE_ENTRIES; // Someone else may have written while we were taking the lock. // DO NOT waste time if there's a good route. - let maybe_route = route_cache.get(&key).copied(); - match maybe_route { - Some(route) if route.is_valid(t) => return route.into_route(xde), - _ => {} - } + let maybe_route = route_cache.entry(key); + let entry_exists = match &maybe_route { + Entry::Occupied(e) if e.get().is_valid(t) => { + return e.get().into_route(xde); + } + Entry::Occupied(_) => true, + _ => false, + }; // We've had a definitive flow miss, but we need to cap the cache // size to prevent excessive modification latencies at high flow @@ -482,24 +487,27 @@ impl RouteCache { // XXX: Want to profile in future to see if LRU expiry is // affordable/sane here. // XXX: A HashMap would exchange insert cost for lookup. - let route_cache = (maybe_route.is_some() - || route_cache.len() <= MAX_CACHE_ENTRIES) - .then_some(route_cache); - - // `next_hop` might fail for myriad reasons, but we still - // send the packet on an underlay device depending on our - // progress. However, we do not want to cache bad mappings. - match (route_cache, next_hop(&key, xde)) { - (Some(mut route_cache), Ok(route)) => { - route_cache.insert(key, route.cached(xde, t)); - route + if entry_exists || space_remaining { + // `next_hop` might fail for myriad reasons, but we still + // send the packet on an underlay device depending on our + // progress. However, we do not want to cache bad mappings. + match (maybe_route, next_hop(&key, xde)) { + (Entry::Vacant(slot), Ok(route)) => { + slot.insert(route.cached(xde, t)); + route + } + (Entry::Occupied(mut slot), Ok(route)) => { + slot.insert(route.cached(xde, t)); + route + } + (_, Err(dev)) => Route::zero_addr(dev), + } + } else { + drop(route_cache); + match next_hop(&key, xde) { + Ok(route) => route, + Err(dev) => Route::zero_addr(dev), } - (None, Ok(route)) => route, - (_, Err(underlay_dev)) => Route { - src: EtherAddr::zero(), - dst: EtherAddr::zero(), - underlay_dev, - }, } } @@ -563,7 +571,7 @@ pub struct Route<'a> { pub underlay_dev: &'a xde_underlay_port, } -impl Route<'_> { +impl<'a> Route<'a> { fn cached(&self, xde: &XdeDev, timestamp: Moment) -> CachedRoute { // As unfortunate as `into_route`. let port_0: &xde_underlay_port = &xde.u1; @@ -572,4 +580,8 @@ impl Route<'_> { CachedRoute { src: self.src, dst: self.dst, underlay_idx, timestamp } } + + fn zero_addr(underlay_dev: &'a xde_underlay_port) -> Route<'a> { + Self { src: EtherAddr::zero(), dst: EtherAddr::zero(), underlay_dev } + } } From 3c9e2ad8f522c9976271af7d019ce9d3a8410c59 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Tue, 21 May 2024 19:16:33 +0100 Subject: [PATCH 09/13] CStrs instead of b""s. --- xde/src/route.rs | 23 ++++++++++++----------- xde/src/xde.rs | 7 ------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/xde/src/route.rs b/xde/src/route.rs index 7dc6c347..c3090378 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -12,6 +12,7 @@ use crate::xde::XdeDev; use alloc::collections::btree_map::Entry; use alloc::collections::BTreeMap; use alloc::sync::Arc; +use core::ffi::CStr; use core::ptr; use core::time::Duration; use illumos_sys_hdrs::*; @@ -42,7 +43,7 @@ extern "C" { gw: uintptr_t, gw_ether_src: uintptr_t, gw_ether_dst: uintptr_t, - msg: uintptr_t, + msg: *const c_char, ); } @@ -51,7 +52,7 @@ fn next_hop_probe( gw: Option<&Ipv6Addr>, gw_eth_src: EtherAddr, gw_eth_dst: EtherAddr, - msg: &[u8], + msg: &CStr, ) { let gw_bytes = gw.unwrap_or(&Ipv6Addr::from([0u8; 16])).bytes(); @@ -61,7 +62,7 @@ fn next_hop_probe( gw_bytes.as_ptr() as uintptr_t, gw_eth_src.to_bytes().as_ptr() as uintptr_t, gw_eth_dst.to_bytes().as_ptr() as uintptr_t, - msg.as_ptr() as uintptr_t, + msg.as_ptr(), ); } } @@ -256,7 +257,7 @@ fn next_hop<'a>( None, EtherAddr::zero(), EtherAddr::zero(), - b"no IRE for destination\0", + c"no IRE for destination", ); return Err(underlay_dev); } @@ -268,7 +269,7 @@ fn next_hop<'a>( None, EtherAddr::zero(), EtherAddr::zero(), - b"destination ILL is NULL\0", + c"destination ILL is NULL", ); return Err(underlay_dev); } @@ -312,7 +313,7 @@ fn next_hop<'a>( Some(&gw_ip6), EtherAddr::zero(), EtherAddr::zero(), - b"no IRE for gateway\0", + c"no IRE for gateway", ); return Err(underlay_dev); } @@ -331,7 +332,7 @@ fn next_hop<'a>( Some(&gw_ip6), EtherAddr::zero(), EtherAddr::zero(), - b"gateway ILL phys addr is NULL\0", + c"gateway ILL phys addr is NULL", ); return Err(underlay_dev); } @@ -359,7 +360,7 @@ fn next_hop<'a>( Some(&gw_ip6), src, EtherAddr::zero(), - b"no NCE for gateway\0", + c"no NCE for gateway", ); return Err(underlay_dev); } @@ -372,7 +373,7 @@ fn next_hop<'a>( Some(&gw_ip6), src, EtherAddr::zero(), - b"no NCE common for gateway\0", + c"no NCE common for gateway", ); return Err(underlay_dev); } @@ -385,7 +386,7 @@ fn next_hop<'a>( Some(&gw_ip6), src, EtherAddr::zero(), - b"NCE MAC address if NULL for gateway\0", + c"NCE MAC address if NULL for gateway", ); return Err(underlay_dev); } @@ -398,7 +399,7 @@ fn next_hop<'a>( .expect("mac from pointer"); let dst = EtherAddr::from(dst); - next_hop_probe(ip6_dst, Some(&gw_ip6), src, dst, b"\0"); + next_hop_probe(ip6_dst, Some(&gw_ip6), src, dst, c""); Ok(Route { src, dst, underlay_dev }) } diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 9458d2ee..196527d8 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -143,13 +143,6 @@ extern "C" { dst_port: uintptr_t, ); pub fn __dtrace_probe_hdlr__resp(resp_str: uintptr_t); - pub fn __dtrace_probe_next__hop( - dst: uintptr_t, - gw: uintptr_t, - gw_ether_src: uintptr_t, - gw_ether_dst: uintptr_t, - msg: *const c_char, - ); pub fn __dtrace_probe_rx(mp: uintptr_t); pub fn __dtrace_probe_tx(mp: uintptr_t); } From 7d844e5b58af33f3e74bc94335998047c92fbe62 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 22 May 2024 12:49:56 +0100 Subject: [PATCH 10/13] Review feedback. --- xde/src/route.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xde/src/route.rs b/xde/src/route.rs index c3090378..9491cf5c 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -424,7 +424,7 @@ fn next_hop<'a>( /// Naturally, bringing this down to O(ns) is desirable. Usually, illumos /// holds `ire_t`s per `conn_t`, but we're aiming to be more fine-grained /// with DDM -- so we need a tradeoff between 'asking about the best route -/// per-packet' and 'holding a route until it is expired'. We choose, for noe, +/// per-packet' and 'holding a route until it is expired'. We choose, for now, /// to hold a route for 100ms. /// /// Note, this uses a `BTreeMap`, but we would prefer the more consistent @@ -540,13 +540,13 @@ pub struct CachedRoute { impl CachedRoute { fn is_valid(&self, t: Moment) -> bool { - t.delta_as_millis(self.timestamp) - <= EXPIRE_ROUTE_LIFETIME.as_millis() as u64 + u128::from(t.delta_as_millis(self.timestamp)) + <= EXPIRE_ROUTE_LIFETIME.as_millis() } fn is_retained(&self, t: Moment) -> bool { - t.delta_as_millis(self.timestamp) - <= REMOVE_ROUTE_LIFETIME.as_millis() as u64 + u128::from(t.delta_as_millis(self.timestamp)) + <= REMOVE_ROUTE_LIFETIME.as_millis() } fn into_route(self, xde: &XdeDev) -> Route<'_> { From 7ccad7ac4d2cf3e323552288b0251dd0797e7d03 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 22 May 2024 12:58:04 +0100 Subject: [PATCH 11/13] Misc. Clippy feeding. --- lib/opte/src/d_error.rs | 2 +- lib/opte/src/engine/packet.rs | 14 +++++++------- lib/opte/src/engine/rule.rs | 2 +- xde/src/xde.rs | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/opte/src/d_error.rs b/lib/opte/src/d_error.rs index a558fe97..edbe61f4 100644 --- a/lib/opte/src/d_error.rs +++ b/lib/opte/src/d_error.rs @@ -145,7 +145,7 @@ impl LabelBlock { } /// Provides access to all stored [`CStr`]s. - pub fn entries<'a>(&'a self) -> LabelBlockIter<'a, L> { + pub fn entries(&self) -> LabelBlockIter<'_, L> { LabelBlockIter { pos: 0, inner: self } } diff --git a/lib/opte/src/engine/packet.rs b/lib/opte/src/engine/packet.rs index f0c2d89c..d9ac2107 100644 --- a/lib/opte/src/engine/packet.rs +++ b/lib/opte/src/engine/packet.rs @@ -469,7 +469,7 @@ impl PacketChain { /// Removes the next packet from the top of the chain and returns /// it, taking ownership. - pub fn next(&mut self) -> Option> { + pub fn pop_front(&mut self) -> Option> { if let Some(ref mut list) = &mut self.inner { unsafe { let curr = list.head.as_ptr(); @@ -554,7 +554,7 @@ impl Drop for PacketChain { unsafe { ddi::freemsgchain(list.head.as_ptr()) }; } } else { - while let Some(pkt) = self.next() { + while let Some(pkt) = self.pop_front() { drop(pkt); } } @@ -3945,7 +3945,7 @@ mod test { fn create_linked_mblks(n: usize) -> Vec<*mut mblk_t> { let mut els = vec![]; - for i in 0..n { + for _ in 0..n { els.push(allocb(8)); } @@ -3962,7 +3962,7 @@ mod test { #[test] fn chain_has_correct_ends() { - let mut els = create_linked_mblks(3); + let els = create_linked_mblks(3); let chain = unsafe { PacketChain::new(els[0]) }.unwrap(); let chain_inner = chain.inner.as_ref().unwrap(); @@ -3976,7 +3976,7 @@ mod test { let mut chain = unsafe { PacketChain::new(els[0]) }.unwrap(); - let p0 = chain.next().unwrap(); + let p0 = chain.pop_front().unwrap(); assert_eq!(p0.mblk_addr(), els[0] as uintptr_t); unsafe { assert!((*els[0]).b_prev.is_null()); @@ -4024,10 +4024,10 @@ mod test { let mut chain = unsafe { PacketChain::new(els[0]) }.unwrap(); for i in 0..els.len() { - let pkt = chain.next().unwrap(); + let pkt = chain.pop_front().unwrap(); assert_eq!(pkt.mblk_addr(), els[i] as uintptr_t); } - assert!(chain.next().is_none()); + assert!(chain.pop_front().is_none()); } } diff --git a/lib/opte/src/engine/rule.rs b/lib/opte/src/engine/rule.rs index 4c9fee9a..881d7fde 100644 --- a/lib/opte/src/engine/rule.rs +++ b/lib/opte/src/engine/rule.rs @@ -834,7 +834,7 @@ impl<'a> Rule { #[cfg(debug_assertions)] { if let Some(preds) = &self.state.preds { - if preds.hdr_preds.len() == 0 && preds.data_preds.len() == 0 { + if preds.hdr_preds.is_empty() && preds.data_preds.is_empty() { panic!( "bug: RulePredicates must have at least one \ predicate" diff --git a/xde/src/xde.rs b/xde/src/xde.rs index 196527d8..2199fa8f 100644 --- a/xde/src/xde.rs +++ b/xde/src/xde.rs @@ -1523,7 +1523,7 @@ unsafe extern "C" fn xde_mc_tx( // by the mch they're being targeted to. E.g., either build a list // of chains (u1, u2, port0, port1, ...), or hold tx until another // packet breaks the run targeting the same dest. - while let Some(pkt) = chain.next() { + while let Some(pkt) = chain.pop_front() { xde_mc_tx_one(src_dev, pkt); } @@ -1825,7 +1825,7 @@ unsafe extern "C" fn xde_rx( // by the mch they're being targeted to. E.g., either build a list // of chains (port0, port1, ...), or hold tx until another // packet breaks the run targeting the same dest. - while let Some(pkt) = chain.next() { + while let Some(pkt) = chain.pop_front() { xde_rx_one(&mch, mrh, pkt); } } From 4c80ce151bbfbebe5c369c4943b9c36cb72a94a9 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 22 May 2024 15:05:30 +0100 Subject: [PATCH 12/13] DTrace probes on route cache events --- xde/src/route.rs | 108 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 4 deletions(-) diff --git a/xde/src/route.rs b/xde/src/route.rs index 9491cf5c..efc9b919 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -45,6 +45,36 @@ extern "C" { gw_ether_dst: uintptr_t, msg: *const c_char, ); + + pub fn __dtrace_probe_routecache__full( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); + + pub fn __dtrace_probe_routecache__hit( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); + + pub fn __dtrace_probe_routecache__insert( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); + + pub fn __dtrace_probe_routecache__refresh( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); + + pub fn __dtrace_probe_routecache__delete( + cache: uintptr_t, + dst: uintptr_t, + entropy: uintptr_t, + ); } fn next_hop_probe( @@ -67,6 +97,56 @@ fn next_hop_probe( } } +fn route_full_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__full( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + +fn route_hit_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__hit( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + +fn route_insert_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__insert( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + +fn route_refresh_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__refresh( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + +fn route_delete_probe(map: uintptr_t, key: &RouteKey) { + unsafe { + __dtrace_probe_routecache__delete( + map, + key.dst.as_ptr() as uintptr_t, + key.l4_hash.unwrap_or_default() as uintptr_t, + ); + } +} + // The following are wrappers for reference drop functions used in XDE. fn ire_refrele(ire: *mut ip::ire_t) { @@ -455,13 +535,19 @@ impl RouteCache { pub fn next_hop<'b>(&self, key: RouteKey, xde: &'b XdeDev) -> Route<'b> { let t = Moment::now(); - let maybe_route = { + let (maybe_route, map_ptr_int) = { let route_cache = self.0.read(); - route_cache.get(&key).copied() + ( + route_cache.get(&key).copied(), + &*route_cache as *const BTreeMap<_, _> as uintptr_t, + ) }; match maybe_route { - Some(route) if route.is_valid(t) => return route.into_route(xde), + Some(route) if route.is_valid(t) => { + route_hit_probe(map_ptr_int, &key); + return route.into_route(xde); + } _ => {} } @@ -474,6 +560,7 @@ impl RouteCache { let maybe_route = route_cache.entry(key); let entry_exists = match &maybe_route { Entry::Occupied(e) if e.get().is_valid(t) => { + route_hit_probe(map_ptr_int, &key); return e.get().into_route(xde); } Entry::Occupied(_) => true, @@ -494,16 +581,19 @@ impl RouteCache { // progress. However, we do not want to cache bad mappings. match (maybe_route, next_hop(&key, xde)) { (Entry::Vacant(slot), Ok(route)) => { + route_insert_probe(map_ptr_int, &key); slot.insert(route.cached(xde, t)); route } (Entry::Occupied(mut slot), Ok(route)) => { + route_refresh_probe(map_ptr_int, &key); slot.insert(route.cached(xde, t)); route } (_, Err(dev)) => Route::zero_addr(dev), } } else { + route_full_probe(map_ptr_int, &key); drop(route_cache); match next_hop(&key, xde) { Ok(route) => route, @@ -518,7 +608,17 @@ impl RouteCache { let mut route_cache = self.0.write(); let t = Moment::now(); - route_cache.retain(|_, v| v.is_retained(t)); + let ptr: *const BTreeMap<_, _> = &*route_cache; + let map_ptr_int = ptr as uintptr_t; + + route_cache.retain(|k, v| { + if v.is_retained(t) { + true + } else { + route_delete_probe(map_ptr_int, &k); + false + } + }); } } From 157f0fdcb12c1672989965ef366b131013ff3257 Mon Sep 17 00:00:00 2001 From: Kyle Simpson Date: Wed, 22 May 2024 15:06:49 +0100 Subject: [PATCH 13/13] One more clippy. --- xde/src/route.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xde/src/route.rs b/xde/src/route.rs index efc9b919..089bc36c 100644 --- a/xde/src/route.rs +++ b/xde/src/route.rs @@ -615,7 +615,7 @@ impl RouteCache { if v.is_retained(t) { true } else { - route_delete_probe(map_ptr_int, &k); + route_delete_probe(map_ptr_int, k); false } });