Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BPF Recorder: Exclude Container Initialization from Recorded Profile #2623

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 2 additions & 31 deletions examples/apparmorprofile-sleep-runc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,7 @@ spec:
allowedCapabilities:
- chown
- dac_override
- dac_read_search
- fsetid
- mknod
- setgid
- setpcap
- setuid
- sys_admin
- sys_chroot
executable:
Expand All @@ -28,32 +23,7 @@ spec:
- /lib/ld-musl-x86_64.so.1
filesystem:
readOnlyPaths:
- /
- /etc/
- /proc/@{pid}/maps
- /proc/@{pid}/setgroups
- /proc/@{pid}/task/@{tid}/fd/
- /proc/@{pid}/uid_map
- /proc/filesystems
- /proc/sys/kernel/cap_last_cap
- /run/
- /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
- /var/lib/containers/storage/overlay/*/merged/
- /var/lib/containers/storage/overlay/*/merged/dev/
- /var/lib/containers/storage/overlay/*/merged/dev/mqueue/
- /var/lib/containers/storage/overlay/*/merged/dev/pts/
- /var/lib/containers/storage/overlay/*/merged/dev/shm/
- /var/lib/containers/storage/overlay/*/merged/etc/
- /var/lib/containers/storage/overlay/*/merged/proc/
- /var/lib/containers/storage/overlay/*/merged/run/
- /var/lib/containers/storage/overlay/*/merged/run/secrets/
- /var/lib/containers/storage/overlay/*/merged/run/secrets/kubernetes.io/
- /var/lib/containers/storage/overlay/*/merged/run/secrets/kubernetes.io/serviceaccount/
- /var/lib/containers/storage/overlay/*/merged/sys/
- /var/lib/containers/storage/overlay/*/merged/sys/fs/cgroup/
readWritePaths:
- /dev/null
- /etc/*
writeOnlyPaths:
- /etc/alpine-release
- /etc/apk/arch
Expand All @@ -67,6 +37,7 @@ spec:
- /etc/busybox-paths.d/busybox
- /etc/crontabs/root
- /etc/fstab
- /etc/group
- /etc/hostname
- /etc/hosts
- /etc/inittab
Expand All @@ -80,6 +51,7 @@ spec:
- /etc/motd
- /etc/network/if-up.d/dad
- /etc/nsswitch.conf
- /etc/passwd
- /etc/profile
- /etc/profile.d/20locale.sh
- /etc/profile.d/README
Expand All @@ -101,7 +73,6 @@ spec:
- /lib/apk/db/lock
- /lib/apk/db/scripts.tar
- /lib/apk/db/triggers
- /proc/@{pid}/task/@{tid}/attr/apparmor/exec
- /sbin/apk
- /sbin/ldconfig
- /usr/bin/getconf
Expand Down
8 changes: 5 additions & 3 deletions hack/ci/e2e-apparmor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ APPARMOR_REFERENCE_TMP_PROFILE_FILE="/tmp/apparmorprofile-sleep-reference.yaml"
APPARMOR_PROFILE_FILE_COMPLAIN_MODE="examples/apparmorprofile-sleep-complain-mode.yaml"
SLEEP_INTERVAL_RECORDING="30" # 30s sleep interval during recording.
SLEEP_INTERVAL_VERIFICATION="300" # 5min to make sure that the enforcement check finds a running PID.
RUNTIMES=(runc crun)
RUNTIMES=(crun runc)
# Default location for CRI-O specific runtime binaries
export PATH="/usr/libexec/crio:$PATH"

Expand All @@ -40,6 +40,8 @@ check_apparmor_profile() {
cp "$APPARMOR_REFERENCE_PROFILE_FILE-$runtime.yaml" $APPARMOR_REFERENCE_TMP_PROFILE_FILE
yq -i ".spec" $APPARMOR_REFERENCE_TMP_PROFILE_FILE

cat $APPARMOR_PROFILE_FILE

diff $APPARMOR_REFERENCE_TMP_PROFILE_FILE $APPARMOR_PROFILE_FILE
}
create_runtimeclass() {
Expand Down Expand Up @@ -155,7 +157,7 @@ check_apparmor_profile_recording() {
echo "Deleting pod $PODNAME"
k delete -f "$pod_file"

echo "Deleting profile recoridng $RECORDING_NAME"
echo "Deleting profile recording $RECORDING_NAME"
k delete -f "$APPARMOR_RECORDING_FILE"

wait_for apparmorprofile $APPARMOR_PROFILE_NAME
Expand All @@ -164,7 +166,7 @@ check_apparmor_profile_recording() {
echo "Verifying apparmor profile"
echo "--------------------------"

echo "Checking the recorded appamror profile matches the reference"
echo "Checking the recorded apparmor profile matches the reference for $runtime"
check_apparmor_profile $runtime

echo "Creating pod $PODNAME with recorded profile in security context"
Expand Down
19 changes: 17 additions & 2 deletions hack/ci/e2e-seccomp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,27 @@ EOT
VERSION=$("$RUNTIME" --version | grep "$RUNTIME version" | grep -oP '\d+.*')
yq -i '.metadata.name = "'"$RUNTIME"'-v'"$VERSION"'"' "$BASEPROFILE"

echo $BASEPROFILE
cat $BASEPROFILE

echo "Deleting seccomp profile"
k delete seccompprofile $RECORDING
done

echo "Diffing output, while ignoring flaky syscalls 'rt_sigreturn', 'sched_yield', 'tgkill', and 'exit'"
git diff --exit-code -U0 -I rt_sigreturn -I sched_yield -I tgkill -I exit examples
# There is a weird phenomenon where we have a `runc` process
# that uses `setns` to join the container mount namespace.
# As a consequence, we sometimes get all the funny syscalls emitted
# by the Go runtime, which we need to ignore.
echo "Diffing output while ignoring flaky syscalls"
git diff --exit-code -U0 \
-I rt_sigreturn \
-I sched_yield \
-I tgkill \
-I exit \
-I madvise \
-I rt_sigprocmask \
-I sigaltstack\
examples

for RUNTIME in "${RUNTIMES[@]}"; do
echo "Verifying that the profile for runtime $RUNTIME is available in the GitHub container registry"
Expand Down
121 changes: 120 additions & 1 deletion internal/pkg/daemon/bpfrecorder/bpf/recorder.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
#define S_IFIFO 0010000
#define S_IFDIR 0040000

#define CLONE_NEWNS 0x00020000

#define CAP_OPT_NOAUDIT 0b10

#define SOCK_RAW 3
Expand All @@ -51,6 +53,9 @@ char LICENSE[] SEC("license") = "Dual BSD/GPL";
#define unlikely(x) __builtin_expect((x), 0)
#endif

#define trace_hook(...)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this look like an empty macro, I would remove it if it is not used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The intent here is that one can quickly toggle between no-op and bpf_printk, which makes debugging BPF much nicer. Does that make sense, any better suggestions?

// #define trace_hook(...) bpf_printk(__VA_ARGS__)

// Keep track of all mount namespaces that should be (temporarily) excluded from
// recording. When running in Kubernetes, we generally ignore the host mntns.
// Additionally, we exclude individual containers during startup.
Expand Down Expand Up @@ -92,6 +97,13 @@ struct {
__uint(max_entries, 1 << 24);
} events SEC(".maps");

struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 1000);
__type(key, u32);
__type(value, bool);
} runc_unshare SEC(".maps");

typedef struct __attribute__((__packed__)) event_data {
u32 pid;
u32 mntns;
Expand All @@ -102,6 +114,8 @@ typedef struct __attribute__((__packed__)) event_data {

const volatile char filter_name[MAX_COMM_LEN] = {};

static const char RUNC_CHILD[] = "runc:[1:CHILD]";
static const char RUNC_INIT[] = "runc:[2:INIT]";
static const bool TRUE = true;
static inline bool has_filter();
static inline bool matches_filter(char * comm);
Expand Down Expand Up @@ -144,6 +158,23 @@ static __always_inline u32 get_mntns()
return mntns;
}

// Debug method to report access to a canary file.
// This is useful during development to see if a particular code path is hit if
// bpf_printk output is inaccessible.
static __always_inline void debug_add_canary_file(char * filename) {
event_data_t * event = bpf_ringbuf_reserve(&events, sizeof(event_data_t), 0);
if (!event) {
bpf_printk("Failed to add canary file: %s", filename);
return;
}
bpf_core_read_str(event->data, sizeof(event->data), filename);
event->pid = bpf_get_current_pid_tgid() >> 32;
event->mntns = get_mntns();
event->type = EVENT_TYPE_APPARMOR_FILE;
event->flags = FLAG_READ | FLAG_WRITE;
bpf_ringbuf_submit(event, 0);
}

static u64 _file_event_inode;
static u64 _file_event_flags;
static u32 _file_event_pid;
Expand Down Expand Up @@ -331,6 +362,94 @@ int BPF_KPROBE(cap_capable)
return 0;
}

SEC("tracepoint/syscalls/sys_enter_unshare")
int sys_enter_unshare(struct trace_event_raw_sys_enter* ctx)
{
trace_hook("sys_enter_unshare");

int flags = ctx->args[0];
bool is_mnt = flags & CLONE_NEWNS;
// trace_hook("sys_enter_unshare mntns=%u is_mnt=%u", get_mntns(), is_mnt);
if(!is_mnt) {
return 0;
}

// Detect runc initialization
char comm[TASK_COMM_LEN] = {};
bpf_get_current_comm(comm, sizeof(comm));
for (int i = 0; i < sizeof(RUNC_CHILD); i++) {
if (comm[i] != RUNC_CHILD[i])
return 0;
}

trace_hook("detected runc init 1/3, waiting for exit...");
u32 pid = bpf_get_current_pid_tgid() >> 32;
bpf_map_update_elem(&runc_unshare, &pid, &TRUE, BPF_ANY);

return 0;
}

SEC("tracepoint/syscalls/sys_exit_unshare")
int sys_exit_unshare(struct trace_event_raw_sys_exit* ctx)
{
u32 mntns = get_mntns();
if (!mntns)
return 0;
trace_hook("sys_exit_unshare mntns=%u", mntns);

u32 pid = bpf_get_current_pid_tgid() >> 32;
if (bpf_map_delete_elem(&runc_unshare, &pid) == 0) {
trace_hook("detected runc init 2/3, marking new mntns for exclusion: %u", mntns);
u8 expected_ppid_calls = 2;
bpf_map_update_elem(&exclude_mntns, &mntns, &expected_ppid_calls, BPF_ANY);

// FIXME: delete to figure out what's going on here.
// bpf_map_delete_elem(&mntns_syscalls, &mntns);
}
return 0;
}

SEC("tracepoint/syscalls/sys_enter_getppid")
int sys_enter_getppid(struct trace_event_raw_sys_enter * ctx)
{
// trace_hook("sys_enter_getppid");

// Handle runc init.
char comm[TASK_COMM_LEN] = {};
bpf_get_current_comm(comm, sizeof(comm));
for (int i = 0; i < sizeof(RUNC_INIT); i++) {
if (comm[i] != RUNC_INIT[i])
return 0;
}

// We expect 2 getppid calls in runc's init,
// and we want to stop ignoring events on the second one.
//
// We could further minimize profiles by waiting until execve instead of getppid.
// This would immediately work for AppArmor (which becomes active from the next execve),
// but would miss the syscalls for seccomp (which becomes active immediately, so we need to include permissions
// for the time between enforcement and the execve call).
// Not doing that yet because splitting AppArmor and seccomp logic adds a lot of complexity;
// hardcoding a list of syscalls required by runc creates maintenance burden.
struct task_struct * task = (struct task_struct *)bpf_get_current_task();
u32 mntns = BPF_CORE_READ(task, nsproxy, mnt_ns, ns.inum);
u8 * calls = bpf_map_lookup_elem(&exclude_mntns, &mntns);
if(calls == NULL) {
trace_hook("runc: unexpected getppid call", mntns);
return 0;
}
(*calls)--;
if(*calls > 0) {
trace_hook("detected runc init 3/4, waiting for %u more calls for mntns %u", *calls, mntns);
bpf_map_update_elem(&exclude_mntns, &mntns, calls, BPF_ANY);
} else {
trace_hook("detected runc init 4/4, reenabling mntns %u", mntns);
bpf_map_delete_elem(&exclude_mntns, &mntns);
}

return 0;
}

SEC("tracepoint/sched/sched_process_exec")
int sched_process_exec(struct trace_event_raw_sched_process_exec * ctx)
{
Expand All @@ -347,7 +466,7 @@ int sched_process_exec(struct trace_event_raw_sched_process_exec * ctx)

if (is_child || matches_filter(comm)) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
bpf_printk("adding child pid: %u", pid);
trace_hook("adding child pid: %u comm=%s", pid, comm);
bpf_map_update_elem(&child_pids, &pid, &TRUE, BPF_ANY);
}
return 0;
Expand Down
Binary file modified internal/pkg/daemon/bpfrecorder/bpf/recorder.bpf.o.amd64
Binary file not shown.
Binary file modified internal/pkg/daemon/bpfrecorder/bpf/recorder.bpf.o.arm64
Binary file not shown.
7 changes: 6 additions & 1 deletion internal/pkg/daemon/bpfrecorder/bpfrecorder.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,9 @@ func (b *BpfRecorder) getMntnsForProfile(profile string) (uint32, bool) {
var baseHooks = []string{
"sys_enter",
"sys_exit_clone",
"sys_enter_getppid",
"sys_enter_unshare",
"sys_exit_unshare",
"sched_process_exec",
"sched_process_exit",
}
Expand Down Expand Up @@ -668,7 +671,9 @@ func (b *BpfRecorder) handleEvent(eventBytes []byte) {
case uint8(eventTypeExit):
b.handleExitEvent(&event)
case uint8(eventTypeAppArmorFile):
b.AppArmor.handleFileEvent(&event)
if b.AppArmor != nil {
b.AppArmor.handleFileEvent(&event)
}
case uint8(eventTypeAppArmorSocket):
b.AppArmor.handleSocketEvent(&event)
case uint8(eventTypeAppArmorCap):
Expand Down
Loading