Skip to content

Commit

Permalink
Merge pull request #7 from ctrox/ebpf-mem
Browse files Browse the repository at this point in the history
Improve shim memory usage
  • Loading branch information
ctrox authored May 4, 2024
2 parents 87e5935 + ccecabf commit 4a34cfd
Show file tree
Hide file tree
Showing 37 changed files with 703 additions and 297 deletions.
33 changes: 29 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: "1.21"
go-version: "1.22"

- uses: dominikh/[email protected].0
- uses: dominikh/[email protected].1
with:
install-go: false
version: "2023.1.6"

test:
Expand All @@ -25,11 +26,35 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: "1.21"
go-version: "1.22"

- name: test
run: sudo --preserve-env make test

build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true

- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: "1.22"

- name: build ebpf image
run: make build-ebpf

- name: generate ebpf
run: make generate

- name: check for diff
run: git diff --exit-code

e2e:
runs-on: ubuntu-latest
steps:
Expand All @@ -43,7 +68,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v4
with:
go-version: "1.21"
go-version: "1.22"

- name: e2e
run: make test-e2e
15 changes: 10 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ build-kind: build
docker cp containerd-shim-zeropod-v2 kind-control-plane:/opt/zeropod/bin/

install-kind: build-installer build-manager
docker exec kind-control-plane mount -t bpf bpf /sys/fs/bpf
kind load docker-image $(INSTALLER_IMAGE)
kind load docker-image $(MANAGER_IMAGE)
kubectl apply -k config/kind
Expand Down Expand Up @@ -63,15 +62,15 @@ test:
# of the host into the container. For now this is the only way to run the e2e
# tests on Mac OS with apple silicon as the shim requires GOOS=linux.
docker-test-e2e: build-test
docker run --rm -ti --privileged --network=host --rm -v $(DOCKER_SOCK):$(DOCKER_SOCK) -v $(PWD):/app $(TEST_IMAGE) make test-e2e
docker run --rm --privileged --network=host --rm -v $(DOCKER_SOCK):$(DOCKER_SOCK) -v $(PWD):/app $(TEST_IMAGE) make test-e2e

docker-bench: build-test
docker run --rm -ti --privileged --network=host --rm -v $(DOCKER_SOCK):$(DOCKER_SOCK) -v $(PWD):/app $(TEST_IMAGE) make bench
docker run --rm --privileged --network=host --rm -v $(DOCKER_SOCK):$(DOCKER_SOCK) -v $(PWD):/app $(TEST_IMAGE) make bench

# has to have SYS_ADMIN because the test tries to set netns and mount bpffs
# we use --pid=host to make the ebpf tracker work without a pid resolver
docker-test:
docker run --rm -ti --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --pid=host -v $(PWD):/app $(TEST_IMAGE) make test
docker run --rm --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --pid=host -v $(PWD):/app $(TEST_IMAGE) make test

CLANG ?= clang
CFLAGS := -O2 -g -Wall -Werror
Expand All @@ -82,4 +81,10 @@ CFLAGS := -O2 -g -Wall -Werror
generate: export BPF_CLANG := $(CLANG)
generate: export BPF_CFLAGS := $(CFLAGS)
generate:
docker run --rm -ti -v $(PWD):/app --env=BPF_CLANG="$(CLANG)" --env=BPF_CFLAGS="$(CFLAGS)" $(EBPF_IMAGE) go generate ./...
docker run --rm -v $(PWD):/app:Z --user $(shell id -u):$(shell id -g) --env=BPF_CLANG="$(CLANG)" --env=BPF_CFLAGS="$(CFLAGS)" $(EBPF_IMAGE)

# to improve reproducibility of the bpf builds, we dump the vmlinux.h and
# store it compressed in git instead of dumping it during the build.
update-vmlinux:
docker run --rm -v $(PWD):/app:Z --entrypoint /bin/sh --user $(shell id -u):$(shell id -g) $(EBPF_IMAGE) \
-c "bpftool btf dump file /sys/kernel/btf/vmlinux format c" | gzip > socket/vmlinux.h.gz
31 changes: 27 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,10 @@ class](https://kubernetes.io/docs/concepts/containers/runtime-class/), it
needs to install binaries to your cluster nodes (by default in `/opt/zeropod`)
and also configure containerd to load the shim. If you are trying this out for the first time, it's
probably best to use a [kind](https://kind.sigs.k8s.io) cluster or something
similar that you can quickly setup and delete again. It uses a DaemonSet for
installing components on the node itself and also runs a `manager` component
as a second container for collecting metrics and probably other uses in the
future.
similar that you can quickly set up and delete again. It uses a DaemonSet
called `zeropod-node` for installing components on the node itself and also
runs the `manager` component for attaching the eBPF programs and collecting
metrics.

### Installation

Expand Down Expand Up @@ -257,6 +257,29 @@ zeropod.ctrox.dev/pre-dump: "true"
zeropod.ctrox.dev/disable-checkpointing: "true"
```
## zeropod-node
The zeropod-node DaemonSet is scheduled on every node labelled
`zeropod.ctrox.dev/node=true`. The individual components of the node daemon
are documented in this section.

### Installer

The installer runs as an init-container and runs the binary
`cmd/installer/main.go` with some distro-specific options to install the
runtime binaries, configure containerd and register the `RuntimeClass`.

### Manager

The manager component starts after the installer init-container has succeeded.
It provides functionality that is needed at the node level and would bloat
the shim otherwise. For example, loading eBPF programs can be quite memory
intensive so they have been moved from the shim to the manager to keep the
shim memory usage as minimal as possible.

In addition to that it collects metrics from all the shim processes and
exposes those metrics on an HTTP endpoint.

## Metrics

The zeropod-node pod exposes metrics on `0.0.0.0:8080/metrics` in Prometheus
Expand Down
155 changes: 134 additions & 21 deletions activator/activator.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,21 @@ import (
"fmt"
"io"
"net"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"time"

"github.com/cilium/ebpf"
"github.com/containerd/log"
"github.com/containernetworking/plugins/pkg/ns"
)

const ()

type Server struct {
listeners []net.Listener
ports []uint16
Expand All @@ -25,41 +32,48 @@ type Server struct {
proxyCancel context.CancelFunc
ns ns.NetNS
firstAccept sync.Once
bpfCloseFunc func()
bpfObjs *bpfObjects
maps bpfMaps
sandboxPid int
started bool
}

type OnAccept func() error

func NewServer(ctx context.Context, ports []uint16, nn ns.NetNS, ifaces ...string) (*Server, error) {
if len(ifaces) == 0 {
return nil, fmt.Errorf("no interfaces have been supplied, at least one is required")
}

func NewServer(ctx context.Context, nn ns.NetNS) (*Server, error) {
s := &Server{
quit: make(chan interface{}),
ports: ports,
connectTimeout: time.Second * 5,
proxyTimeout: time.Second * 5,
ns: nn,
sandboxPid: parsePidFromNetNS(nn),
}

if err := nn.Do(func(_ ns.NetNS) error {
objs, close, err := initBPF(ifaces...)
if err != nil {
return err
}
s.bpfObjs = objs
s.bpfCloseFunc = close
return nil
}); err != nil {
return nil, err
return s, os.MkdirAll(PinPath(s.sandboxPid), os.ModePerm)
}

func parsePidFromNetNS(nn ns.NetNS) int {
parts := strings.Split(nn.Path(), "/")
if len(parts) < 3 {
return 0
}

return s, nil
pid, err := strconv.Atoi(parts[2])
if err != nil {
return 0
}

return pid
}

func (s *Server) Start(ctx context.Context, onAccept OnAccept) error {
// ErrMapNotFound is returned by loadPinnedMaps (and therefore by Start) when
// the pinned bpf maps of the sandbox do not exist on the filesystem.
var ErrMapNotFound = errors.New("bpf map could not be found")

func (s *Server) Start(ctx context.Context, ports []uint16, onAccept OnAccept) error {
s.ports = ports

if err := s.loadPinnedMaps(); err != nil {
return err
}

for _, port := range s.ports {
proxyPort, err := s.listen(ctx, port, onAccept)
if err != nil {
Expand All @@ -72,9 +86,14 @@ func (s *Server) Start(ctx context.Context, onAccept OnAccept) error {
}
}

s.started = true
return nil
}

// Started reports whether the started flag is set. Start sets it right
// before returning without an error.
func (s *Server) Started() bool {
return s.started
}

func (s *Server) Reset() error {
s.firstAccept = sync.Once{}
for _, port := range s.ports {
Expand Down Expand Up @@ -140,7 +159,9 @@ func (s *Server) Stop(ctx context.Context) {
l.Close()
}

s.bpfCloseFunc()
log.G(ctx).Debugf("removing %s", PinPath(s.sandboxPid))

_ = os.RemoveAll(PinPath(s.sandboxPid))

s.wg.Wait()
log.G(ctx).Debugf("activator stopped")
Expand Down Expand Up @@ -277,6 +298,98 @@ func (s *Server) connect(ctx context.Context, port uint16) (net.Conn, error) {
}
}

// File names of the pinned bpf maps. They are joined onto the pin path of the
// sandbox (see mapPath) to locate each map on the filesystem.
const (
activeConnectionsMap = "active_connections"
disableRedirectMap = "disable_redirect"
egressRedirectsMap = "egress_redirects"
ingressRedirectsMap = "ingress_redirects"
)

// loadPinnedMaps opens the bpf maps pinned below PinPath(s.sandboxPid) and
// stores them in s.maps. A map that has already been loaded (its field is
// non-nil) is left untouched, so calling the method again only loads what is
// still missing.
//
// It returns ErrMapNotFound if the active connections map is not pinned, and
// a wrapped error naming the affected map if loading any pinned map fails.
func (s *Server) loadPinnedMaps() error {
	// either all or none of the maps are pinned, so probing a single one is
	// enough to decide whether ErrMapNotFound has to be returned so it can be
	// handled by the caller.
	if _, err := os.Stat(s.mapPath(activeConnectionsMap)); errors.Is(err, os.ErrNotExist) {
		return ErrMapNotFound
	}

	// table of pinned map name -> destination field, loaded in this order.
	pinned := []struct {
		name string
		dst  **ebpf.Map
	}{
		{activeConnectionsMap, &s.maps.ActiveConnections},
		{disableRedirectMap, &s.maps.DisableRedirect},
		{egressRedirectsMap, &s.maps.EgressRedirects},
		{ingressRedirectsMap, &s.maps.IngressRedirects},
	}

	opts := &ebpf.LoadPinOptions{}
	for _, p := range pinned {
		if *p.dst != nil {
			// already loaded by an earlier call.
			continue
		}

		m, err := ebpf.LoadPinnedMap(s.mapPath(p.name), opts)
		if err != nil {
			return fmt.Errorf("loading pinned map %q: %w", p.name, err)
		}
		*p.dst = m
	}

	return nil
}

// mapPath returns the filesystem path of the pinned map called name inside
// the pin directory of this server's sandbox.
func (s *Server) mapPath(name string) string {
	pinDir := PinPath(s.sandboxPid)
	return filepath.Join(pinDir, name)
}

// RedirectPort records a redirect of port from to port to in the ingress
// redirects map and the reverse mapping (to -> from) in the egress redirects
// map. The ingress entry is written first; if that fails the egress map is
// not touched.
func (s *Server) RedirectPort(from, to uint16) error {
	ingressErr := s.maps.IngressRedirects.Put(&from, &to)
	if ingressErr != nil {
		return fmt.Errorf("unable to put ports %d -> %d into bpf map: %w", from, to, ingressErr)
	}

	egressErr := s.maps.EgressRedirects.Put(&to, &from)
	if egressErr != nil {
		return fmt.Errorf("unable to put ports %d -> %d into bpf map: %w", to, from, egressErr)
	}

	return nil
}

// registerConnection stores port with the value 1 in the active connections
// map.
func (s *Server) registerConnection(port uint16) error {
	const active = uint8(1)

	err := s.maps.ActiveConnections.Put(&port, active)
	if err == nil {
		return nil
	}
	return fmt.Errorf("unable to put port %d into bpf map: %w", port, err)
}

// removeConnection deletes the entry for port from the active connections
// map. Any error from the delete, including a missing key, is returned
// wrapped.
func (s *Server) removeConnection(port uint16) error {
	err := s.maps.ActiveConnections.Delete(&port)
	if err == nil {
		return nil
	}
	return fmt.Errorf("unable to delete port %d in bpf map: %w", port, err)
}

// disableRedirect stores port with the value 1 in the disable redirect map.
func (s *Server) disableRedirect(port uint16) error {
	const disabled = uint8(1)

	err := s.maps.DisableRedirect.Put(&port, disabled)
	if err == nil {
		return nil
	}
	return fmt.Errorf("unable to put %d into bpf map: %w", port, err)
}

// enableRedirect removes port from the disable redirect map. A port that is
// not present in the map is not treated as an error; any other failure is
// returned wrapped with the affected port, matching the other map helpers.
func (s *Server) enableRedirect(port uint16) error {
	err := s.maps.DisableRedirect.Delete(&port)
	if err != nil && !errors.Is(err, ebpf.ErrKeyNotExist) {
		return fmt.Errorf("unable to delete %d in bpf map: %w", port, err)
	}
	return nil
}

// proxy just proxies between conn1 and conn2.
func proxy(ctx context.Context, conn1, conn2 net.Conn) error {
defer conn1.Close()
Expand Down
Loading

0 comments on commit 4a34cfd

Please sign in to comment.