👨‍⚕️

/var/log/netflix/noisy-neighbor-detection-with-ebpf

2024/09/16に公開

Netflix のマルチテナントでの隣人問題で eBPF を用いて調査した記事のメモ。

Gigazine による抜粋 & 日本語: https://gigazine.net/news/20240916-netflix-ebpf-noisy-neighbor/

調査結果の考察なども興味深かったが、それよりもコンテナを eBPF で監視するという点が面白かったので、手を動かしてみた。

コンテナは cgroup と紐付ける

eBPF の世界でコンテナを識別するには、コンテナが cgroups を使っているという前提をおいたうえで、task から cgroup_id を取得する以外の方法は現状ないと思う。
cgroup_id を持ってくるときに RCU(Read Copy Update) lockというのを取っておかないといけないのを知らなかった。初めましてだったのでまた深掘りしたい。

/*
 * Return the id of the cgroup reached through task->cgroups->dfl_cgrp.
 *
 * The pointer chain is read between bpf_rcu_read_lock() and
 * bpf_rcu_read_unlock(); the lock is released before returning.
 */
u64 get_task_cgroup_id(struct task_struct *task)
{
    u64 id;

    bpf_rcu_read_lock();
    id = task->cgroups->dfl_cgrp->kn->id;
    bpf_rcu_read_unlock();

    return id;
}

動かす

元記事だと Ring Buffer を使っているが、それを可視化しようと思うといろいろ準備が必要そうなので hist を用いてごにょごにょした。

コード

noisy_neighbor.bpf.c

/*
Most of the code is extracted from:
https://netflixtechblog.com/noisy-neighbor-detection-with-ebpf-64b1f4b3bbdd
*/
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include "maps.bpf.h"
#include "bits.bpf.h"
#include "noisy_neighbor.h"

/* Capacity of the runq_enqueued and cgroup_id_to_last_event_ts maps. */
#define MAX_TASK_ENTRIES 1024
/* NOTE(review): not referenced anywhere in this file; it looks like a leftover
 * from a ring-buffer based variant — confirm before removing. */
#define RINGBUF_SIZE_BYTES 65536
/* Threshold, in nanoseconds (1e9 ns = 1 s), that tp_sched_switch compares
 * against the time elapsed since a cgroup's last recorded event. */
#define RATE_LIMIT_NS 1000000000
/* Capacity of the hists map. */
#define MAX_ENTRIES 10240

/* When true, tp_sched_switch divides the latency by 1,000,000 (ns -> ms)
 * before choosing a histogram slot. */
const volatile bool ms = false;

/*
 * Hash map: pid (u32) -> bpf_ktime_get_ns() timestamp (u64).
 * tp_sched_wakeup inserts an entry; tp_sched_switch reads it for the task
 * being switched in and then deletes it.
 */
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, MAX_TASK_ENTRIES);
    __uint(key_size, sizeof(u32));
    __uint(value_size, sizeof(u64));
} runq_enqueued SEC(".maps");

/// @sample {"interval": 1000, "type" : "log2_hist"}
struct {
 /* Hash map of histograms (struct hist). tp_sched_switch looks an entry up by
  * cgroup id, creating it from a zeroed struct hist when it is missing. */
 __uint(type, BPF_MAP_TYPE_HASH);
 __uint(max_entries, MAX_ENTRIES);
 /* NOTE(review): the key is u32 while get_task_cgroup_id() returns a u64, so
  * only 32 bits of a cgroup id can serve as the key — confirm this is intended. */
 __type(key, u32);
 __type(value, struct hist);
} hists SEC(".maps");

/* Zero-initialized template handed to bpf_map_lookup_or_try_init() in
 * tp_sched_switch when a key has no entry in hists yet. */
static struct hist zero;

/* Externally provided symbols (__ksym); get_task_cgroup_id() calls them
 * around its reads of task->cgroups. */
void bpf_rcu_read_lock(void) __ksym;
void bpf_rcu_read_unlock(void) __ksym;

/*
 * Return the id of the cgroup reached through task->cgroups->dfl_cgrp.
 *
 * The pointer chain is read between bpf_rcu_read_lock() and
 * bpf_rcu_read_unlock(); the lock is released before returning.
 */
u64 get_task_cgroup_id(struct task_struct *task)
{
    u64 id;

    bpf_rcu_read_lock();
    id = task->cgroups->dfl_cgrp->kn->id;
    bpf_rcu_read_unlock();

    return id;
}

/*
 * sched_wakeup handler: remember when a task was woken up.
 *
 * ctx[0] is used as the task pointer. Its pid is the key and the current
 * bpf_ktime_get_ns() value is stored in runq_enqueued. Always returns 0.
 */
SEC("tp_btf/sched_wakeup")
int tp_sched_wakeup(u64 *ctx)
{
    struct task_struct *woken = (struct task_struct *)ctx[0];
    u32 key = woken->pid;
    u64 enqueued_at = bpf_ktime_get_ns();

    /* BPF_NOEXIST: an already-stored timestamp for this pid is left as is;
     * the helper's return value is not checked. */
    bpf_map_update_elem(&runq_enqueued, &key, &enqueued_at, BPF_NOEXIST);

    return 0;
}

/*
 * Per-CPU hash map: cgroup id (u64) -> timestamp in ns (u64).
 * tp_sched_switch reads it to decide whether a cgroup's event falls inside
 * the RATE_LIMIT_NS window and should be dropped.
 */
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
    __uint(max_entries, MAX_TASK_ENTRIES);
    __uint(key_size, sizeof(u64));
    __uint(value_size, sizeof(u64));
} cgroup_id_to_last_event_ts SEC(".maps");


/*
 * sched_switch handler: measure how long the incoming task waited between
 * its wakeup and being scheduled, and add that latency to the log2 histogram
 * of the task's cgroup.
 *
 * ctx[2] is used as the task being switched in. If runq_enqueued holds no
 * timestamp for its pid the event is skipped. Events are rate-limited per
 * cgroup id and per CPU: at most one sample is recorded per RATE_LIMIT_NS.
 * Always returns 0.
 */
SEC("tp_btf/sched_switch")
int tp_sched_switch(u64 *ctx)
{
    struct task_struct *next = (struct task_struct *)ctx[2];
    u32 next_pid = next->pid;
    u64 slot;
    struct hist *histp;

    // fetch timestamp of when the next task was enqueued
    u64 *tsp = bpf_map_lookup_elem(&runq_enqueued, &next_pid);
    if (tsp == NULL) {
        return 0; // missed enqueue
    }

    // calculate runq latency before deleting the stored timestamp
    u64 now = bpf_ktime_get_ns();
    u64 runq_lat = now - *tsp;

    // delete pid from enqueued map
    bpf_map_delete_elem(&runq_enqueued, &next_pid);

    u64 cgroup_id = get_task_cgroup_id(next);

    // per-cgroup-id-per-CPU rate-limiting
    // to balance observability with performance overhead
    u64 *last_ts =
        bpf_map_lookup_elem(&cgroup_id_to_last_event_ts, &cgroup_id);
    u64 last_ts_val = last_ts == NULL ? 0 : *last_ts;

    // check the rate limit for the cgroup_id in consideration
    // before doing more work
    if (now - last_ts_val < RATE_LIMIT_NS) {
        // Rate limit exceeded, drop the event
        return 0;
    }

    // hists is keyed by u32: build a key of exactly that width instead of
    // handing the map a pointer to the 64-bit id
    u32 hist_key = (u32)cgroup_id;

    histp = bpf_map_lookup_or_try_init(&hists, &hist_key, &zero);
    if (!histp)
        return 0;

    bpf_probe_read_kernel_str(&histp->comm, sizeof(histp->comm), next->comm);
    // the size argument must be the destination buffer size; sizeof(kn->name)
    // is only the size of a pointer and would truncate the name
    bpf_probe_read_kernel_str(&histp->cgroup_name, sizeof(histp->cgroup_name),
                              next->cgroups->dfl_cgrp->kn->name);

    if (ms)
        runq_lat /= 1000000U;

    slot = log2l(runq_lat);
    if (slot >= MAX_SLOTS)
        slot = MAX_SLOTS - 1;

    __sync_fetch_and_add(&histp->slots[slot], 1);

    // remember when this cgroup was last sampled on this CPU so that the
    // rate-limit check above can actually take effect
    bpf_map_update_elem(&cgroup_id_to_last_event_ts, &cgroup_id, &now, BPF_ANY);

    return 0;
}

/* License string placed in the "license" section of the object file. */
char LICENSE[] SEC("license") = "GPL";

noisy_neighbor.h

#ifndef __NOISY_NEIGHBOR_H
#define __NOISY_NEIGHBOR_H

/* Size in bytes of hist.comm.
 * NOTE(review): the kernel's own task comm is much shorter (16 bytes, as far
 * as I know); 512 only sizes this buffer — confirm the larger size is wanted. */
#define TASK_COMM_LEN 512
/* Size in bytes of hist.cgroup_name. */
#define CGROUP_NAME_LEN 512
/* Number of histogram buckets; tp_sched_switch clamps the slot index to
 * MAX_SLOTS - 1.
 * NOTE(review): if log2l() returns floor(log2) of a 64-bit value, indexes
 * above 63 can never be hit — verify against bits.bpf.h. */
#define MAX_SLOTS 256

/* Value type of the hists map: one latency histogram plus the labels that
 * tp_sched_switch copies in on each recorded event. */
struct hist {
    /* Bucket counters, indexed by log2l() of the latency. */
    __u32 slots[MAX_SLOTS];
    /* Copied from the comm of the task being switched in. */
    char comm[TASK_COMM_LEN];
    /* Copied from the kn->name of that task's dfl_cgrp. */
    char cgroup_name[CGROUP_NAME_LEN];
};

#endif /* __NOISY_NEIGHBOR_H */

vmlinux.h は以下で生成できます。

$ bpftool btf dump file /sys/kernel/btf/vmlinux format c > vmlinux.h

maps.bpf.h と bits.bpf.h はライセンスを確認して以下のリポジトリから探して利用しました。

ビルド。今回は eunomia なるものを使ってみた。普通に便利だった。このプロジェクトも今後が楽しみ。

ecc noisy_neighbor.bpf.c noisy_neighbor.h
sudo ecli run package.json

それっぽいのが出てきた。keyが cgroup id となっている。単位は ns。 unit の指定の仕方がわからなかった...

key =  734533
comm = kubelet
cgroup_name = kubelet

     (unit)              : count    distribution
         0 -> 1          : 0        |                                        |
         2 -> 3          : 0        |                                        |
         4 -> 7          : 0        |                                        |
         8 -> 15         : 0        |                                        |
        16 -> 31         : 0        |                                        |
        32 -> 63         : 0        |                                        |
        64 -> 127        : 0        |                                        |
       128 -> 255        : 0        |                                        |
       256 -> 511        : 0        |                                        |
       512 -> 1023       : 234      |****************************            |
      1024 -> 2047       : 167      |********************                    |
      2048 -> 4095       : 279      |*********************************       |
      4096 -> 8191       : 318      |**************************************  |
      8192 -> 16383      : 334      |****************************************|
     16384 -> 32767      : 147      |*****************                       |
     32768 -> 65535      : 16       |*                                       |
     65536 -> 131071     : 1        |                                        |
    131072 -> 262143     : 2        |                                        |
    262144 -> 524287     : 1        |                                        |
    524288 -> 1048575    : 0        |                                        |
   1048576 -> 2097151    : 0        |                                        |
   2097152 -> 4194303    : 0        |                                        |
   4194304 -> 8388607    : 0        |                                        |
   8388608 -> 16777215   : 0        |                                        |
  16777216 -> 33554431   : 0        |                                        |
  33554432 -> 67108863   : 1        |                                        |

この場合は cgroup id が 734533 となっている。cgroup id は inode と一緒なので確認することができる。

$ find /sys/fs/cgroup -inum 734533
/sys/fs/cgroup/system.slice/docker-200cc329c4c0857930a381b3efe07499cb5efd3a5fdb58753e48cc32d00da9bd.scope/kubelet.slice/kubelet.service

kind で起動している kubelet らしい。ちゃんととれていそう。

感想

このあたりの cgroup まわりの取り回しは毎回ちょっと面倒だなと思う。いい感じのツールができてきて containerd の API などと連携できれば、Pod まできれいに辿れるようになり、Kubernetes でいろいろ使えるようになる日もそう遠くない気がしました。

コンテナは cgroup と紐付ける

動かす

感想

Discussion