iT邦幫忙

2025 iThome 鐵人賽

DAY 12
0

如果覺得文章對你有所啟發,可以考慮用 🌟 支持 Gthulhu 專案,短期目標是集齊 300 個 🌟 藉此被 CNCF Landscape 採納 [ref]

sched_ext

Linux kernel 自 v6.12 開始支援 sched_ext(Scheduler Extension)[1],它賦予了我們在 user space 動態插入系統排程器的能力:

  • 以 eBPF program 的形式客製化熱插拔的 OS scheduler。
  • Kernel 內建 watchdog 避免 deadlock 以及 starvation,如果 custom scheduler 沒辦法在一段時間內為所有任務排程,那系統會將你注入的排程器剔除。
  • BPF 保證了安全性(沒有記憶體錯誤、沒有 kernel panic)。

筆者補充:
Watchdog 本身實作於 Linux 的 Concurrency Managed Workqueue(CMWQ)機制上,所以這個 task 本身會交由 kworker 執行。若你實作的排程器沒辦法讓 task 對應的 kworker 在 scx 設定的 timeout 內被分配到 CPU 且執行完畢,那麼你的 scheduler 就會被系統踢出。

BPF_STRUCT_OPS 的用途

/*
 * Choose the CPU that a task should be migrated to before it gets
 * enqueued (this happens at wakeup, fork, or exec time). The choice is
 * delegated to the default ops.select_cpu() implementation; when that
 * implementation finds an idle core, the task is inserted straight into
 * SCX_DSQ_LOCAL and the ops.enqueue() callback is skipped.
 *
 * This callback behaves exactly like the default ops.select_cpu
 * implementation, so a scheduler that simply left out the
 * simple_select_cpu() struct_ops prog would behave exactly the same.
 */
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p,
                   s32 prev_cpu, u64 wake_flags)
{
        /* Must start out initialized, otherwise the BPF verifier rejects the program */
        bool found_idle = false;
        s32 target_cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags,
                                                &found_idle);

        /* No idle core was reported: nothing is inserted from here */
        if (!found_idle)
                return target_cpu;

        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
        return target_cpu;
}

參考 Linux Kernel 官方文件 [3] 提供的範例,我們可以看到範例中的函式都使用了 BPF_STRUCT_OPS 這個 MACRO:

/*
 * BPF_STRUCT_OPS - declare a struct_ops callback named @name.
 *
 * Expands to a SEC() annotation whose section string is "struct_ops/"
 * followed by the stringified @name, and then to BPF_PROG(name, ...)
 * with the remaining arguments forwarded unchanged.
 */
#define BPF_STRUCT_OPS(name, args...)      \
SEC("struct_ops/"#name)        \
BPF_PROG(name, ##args)

這個 MACRO 會將函式轉換為 BPF struct_ops(也就是 BPF_PROG_TYPE_STRUCT_OPS 類型的 eBPF program [6]),從 Linux Plumbers Conference 的這場分享 [4] 可以得知 BPF struct_ops 是 Kernel 提供的一個方法,讓 Kernel 的子系統能夠呼叫使用者定義的函式:

image

來源:[4]

image

來源:[5]

實作 sched_ext 的 hook function

回到 scx_simple 這個範例:

/*
 * Operations table of the "simple" scheduler, placed in the
 * ".struct_ops" section. Each callback member is set to the matching
 * simple_* program (cast to void *), and .name holds the string "simple".
 */
SEC(".struct_ops")
struct sched_ext_ops simple_ops = {
        .select_cpu             = (void *)simple_select_cpu,
        .enqueue                = (void *)simple_enqueue,
        .init                   = (void *)simple_init,
        .exit                   = (void *)simple_exit,
        .name                   = "simple",
};

最終我們會將前面定義好的 eBPF prog 以函式指標的方式指派到 sched_ext_ops 這個結構中。
而 sched_ext_ops 這個結構實際上是在 kernel source 定義好的 dummy interface,參考 kernel/sched/ext.c

/*
 * Table that assigns a *_stub function to every sched_ext_ops callback.
 * It is referenced through .cfi_stubs by bpf_sched_ext_ops. The cgroup_*
 * entries are only present when CONFIG_EXT_GROUP_SCHED is defined.
 */
static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
 .select_cpu = select_cpu_stub,
 .enqueue = enqueue_stub,
 .dequeue = dequeue_stub,
 .dispatch = dispatch_stub,
 .tick = tick_stub,
 .runnable = runnable_stub,
 .running = running_stub,
 .stopping = stopping_stub,
 .quiescent = quiescent_stub,
 .yield = yield_stub,
 .core_sched_before = core_sched_before_stub,
 .set_weight = set_weight_stub,
 .set_cpumask = set_cpumask_stub,
 .update_idle = update_idle_stub,
 .cpu_acquire = cpu_acquire_stub,
 .cpu_release = cpu_release_stub,
 .init_task = init_task_stub,
 .exit_task = exit_task_stub,
 .enable = enable_stub,
 .disable = disable_stub,
#ifdef CONFIG_EXT_GROUP_SCHED
 .cgroup_init = cgroup_init_stub,
 .cgroup_exit = cgroup_exit_stub,
 .cgroup_prep_move = cgroup_prep_move_stub,
 .cgroup_move = cgroup_move_stub,
 .cgroup_cancel_move = cgroup_cancel_move_stub,
 .cgroup_set_weight = cgroup_set_weight_stub,
#endif
 .cpu_online = cpu_online_stub,
 .cpu_offline = cpu_offline_stub,
 .init = init_stub,
 .exit = exit_stub,
 .dump = dump_stub,
 .dump_cpu = dump_cpu_stub,
 .dump_task = dump_task_stub,
};

/*
 * struct_ops descriptor named "sched_ext_ops". It bundles the verifier
 * ops, the reg/unreg/update/validate callbacks, the per-member
 * check/init callbacks, and the stub table __bpf_ops_sched_ext_ops
 * (via .cfi_stubs). This is the object handed to
 * register_bpf_struct_ops() when the interface is registered.
 */
static struct bpf_struct_ops bpf_sched_ext_ops = {
 .verifier_ops = &bpf_scx_verifier_ops,
 .reg = bpf_scx_reg,
 .unreg = bpf_scx_unreg,
 .check_member = bpf_scx_check_member,
 .init_member = bpf_scx_init_member,
 .init = bpf_scx_init,
 .update = bpf_scx_update,
 .validate = bpf_scx_validate,
 .name = "sched_ext_ops",
 .owner = THIS_MODULE,
 .cfi_stubs = &__bpf_ops_sched_ext_ops
};

最後在 scx_init 完成對該介面的註冊:ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
上面的解說可能沒辦法讓大家完全理解 struct_ops,所以筆者使用 bpftool 來觀察當 eBPF scheduler attached 後,會出現哪些新的資訊:

> sudo bpftool map list
[sudo] password for ian: 
6362: percpu_array  name cpu_ctx_stor  flags 0x0
        key 4B  value 24B  max_entries 1  memlock 752B
        btf_id 5965
        pids main(791867)
6363: task_storage  name task_ctx_stor  flags 0x1
        key 4B  value 32B  max_entries 0  memlock 912B
        btf_id 5965
        pids main(791867)
6364: ringbuf  name queued  flags 0x0
        key 0B  value 0B  max_entries 262144  memlock 275736B
        pids main(791867)
6365: user_ringbuf  name dispatched  flags 0x0
        key 0B  value 0B  max_entries 262144  memlock 275736B
        pids main(791867)
6366: hash  name pid_mm_fault_ma  flags 0x0
        key 4B  value 8B  max_entries 4096  memlock 330656B
        btf_id 5965
        pids main(791867)
6367: array  name usersched_timer  flags 0x0
        key 4B  value 16B  max_entries 1  memlock 280B
        btf_id 5965
        pids main(791867)
6369: array  name main.rodata  flags 0x480
        key 4B  value 504B  max_entries 1  memlock 8192B
        btf_id 5965  frozen
        pids main(791867)
6370: array  name .data.uei_dump  flags 0x400
        key 4B  value 1B  max_entries 1  memlock 8192B
        btf_id 5965
        pids main(791867)
6371: array  name main.data  flags 0x400
        key 4B  value 1425B  max_entries 1  memlock 8192B
        btf_id 5965
        pids main(791867)
6372: array  name main.bss  flags 0x400
        key 4B  value 92B  max_entries 1  memlock 8192B
        btf_id 5965
        pids main(791867)
6373: struct_ops  name goland  flags 0x2000
        key 4B  value 512B  max_entries 1  memlock 6096B
        btf_id 5965
        pids main(791867)
6376: array  name libbpf_global  flags 0x0
        key 4B  value 32B  max_entries 1  memlock 296B
6377: array  name pid_iter.rodata  flags 0x480
        key 4B  value 4B  max_entries 1  memlock 8192B
        btf_id 5976  frozen
        pids bpftool(792421)
6378: array  name libbpf_det_bind  flags 0x0
        key 4B  value 32B  max_entries 1  memlock 296B

只要是包含 pids main(791867) 資訊的 BPF map,都是由 eBPF scheduler 建立的。接著,讓我們觀察 goland 這個 struct_ops map 的內容:

> sudo bpftool map dump name goland
[{
        "value": {
            "common": {
                "refcnt": {
                    "refs": {
                        "counter": 1
                    }
                },
                "state": "BPF_STRUCT_OPS_STATE_READY"
            },
            "data": {
                "select_cpu": "0x28aa",
                "enqueue": "0x28ad",
                "dequeue": "(nil)",
                "dispatch": "0x28ae",
                "tick": "(nil)",
                "runnable": "(nil)",
                "running": "0x28af",
                "stopping": "0x28b0",
                "quiescent": "(nil)",
                "yield": "(nil)",
                "core_sched_before": "(nil)",
                "set_weight": "(nil)",
                "set_cpumask": "0x28b2",
                "update_idle": "0x28b1",
                "cpu_acquire": "(nil)",
                "cpu_release": "0x28b3",
                "init_task": "0x28b4",
                "exit_task": "(nil)",
                "enable": "(nil)",
                "disable": "(nil)",
                "dump": "(nil)",
                "dump_cpu": "(nil)",
                "dump_task": "(nil)",
                "cgroup_init": "(nil)",
                "cgroup_exit": "(nil)",
                "cgroup_prep_move": "(nil)",
                "cgroup_move": "(nil)",
                "cgroup_cancel_move": "(nil)",
                "cgroup_set_weight": "(nil)",
                "cpu_online": "(nil)",
                "cpu_offline": "(nil)",
                "init": "0x28b5",
                "exit": "0x28b6",
                "dispatch_max_batch": 512,
                "flags": 3,
                "timeout_ms": 5000,
                "exit_dump_len": 0,
                "hotplug_seq": 0,
                "name": "goland"
            }
        }
    }
]

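我們就能找到每一個 scheduler ops 對應的函式指標欄位囉。需要注意的是,bpftool 在這裡顯示的數值(例如 0x28aa)應為該 hook 所對應 BPF program 的 ID,而不是實際的記憶體位址;顯示為 (nil) 的欄位則代表排程器沒有實作該 hook。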

總結

透過這篇文章可以得知:

  • 使用 scx 開發自己的排程器需要遵守 struct_ops 的規範
  • struct_ops 就像是用一個 member 都是 function pointer 的結構將每個排程器的 hook function 鏈結起來
  • 將 struct_ops 對應的 Map 載入到系統後,kernel 就能得知每個 hook function 對應的記憶體位址,接著系統就會在排程的各個進入點呼叫對應的 hook function。

至於排程器的各個進入點是什麼?就交給明天的文章探討囉。

References

  1. https://www.phoronix.com/news/Linux-6.12-Lands-sched-ext
  2. Crafting a Linux kernel scheduler in Rust - Andrea Righi
  3. https://www.kernel.org/doc/html/v6.12/scheduler/sched-ext.html
  4. https://lpc.events/event/17/contributions/1607/attachments/1164/2407/lpc-struct_ops.pdf
  5. http://oldvger.kernel.org/bpfconf2024_material/struct_ops-lsfmmbpf-2024.pdf
  6. https://docs.ebpf.io/linux/program-type/BPF_PROG_TYPE_STRUCT_OPS/

上一篇
eBPF skeleton
系列文
30 篇文帶你用 eBPF 與 Golang 打造 Linux Scheduler12
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言