如果覺得文章對你有所啟發,可以考慮用 🌟 支持 Gthulhu 專案,短期目標是集齊 300 個 🌟 藉此被 CNCF Landscape 採納 [ref]。
Linux kernel 自 v6.12 開始支援 sched_ext(Scheduler Extension)[1],它賦予了我們在 user space 動態插入系統排程器的能力:
筆者補充:
Watchdog 本身實作於 Linux 的 Concurrency Managed Workqueue(CMWQ)機制上,所以這個 task 本身會交由 kworker 執行。若你實作的排程器沒辦法讓 task 對應的 kworker 在 scx 設定的 timeout 內被分配到 CPU 且執行完畢,那麼你的 scheduler 就會被系統踢出。
/*
 * Choose the CPU that @p should be migrated to before it gets enqueued
 * (at wakeup, fork time, or exec time).
 *
 * The decision is delegated to scx_bpf_select_cpu_dfl(), i.e. the default
 * ops.select_cpu() implementation. When it reports that an idle core was
 * found, the task is inserted straight into SCX_DSQ_LOCAL, which skips
 * the ops.enqueue() callback.
 *
 * This is exactly what the default ops.select_cpu implementation does, so
 * the scheduler would behave the same if the simple_select_cpu()
 * struct_ops prog were not defined at all.
 */
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	/* Must be initialized, otherwise the BPF verifier rejects the program */
	bool found_idle = false;
	s32 picked = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found_idle);

	/* No idle core: let the task go through the regular enqueue path */
	if (!found_idle)
		return picked;

	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
	return picked;
}
參考 Linux Kernel 官方文件 [3] 提供的範例,我們可以看到範例中的函式都使用了 BPF_STRUCT_OPS 這個 MACRO:
/*
 * Helper for defining a struct_ops callback.
 *
 * Expands to SEC("struct_ops/<name>") followed by BPF_PROG(name, args...):
 * `#name` stringizes the callback name into the section string, and
 * `##args` drops the preceding comma when no extra arguments are given
 * (GNU variadic-macro extension).
 */
#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, ##args)
這個 MACRO 會將函式轉換為 BPF struct_ops(也就是 BPF_PROG_TYPE_STRUCT_OPS 類型的 eBPF program [6])。從 Linux Plumbers Conference 的這場分享 [4] 可以得知,BPF struct_ops 是 Kernel 提供的一種機制,讓 Kernel 的子系統能夠呼叫使用者定義的函式:
來源:[4]
來源:[5]
回到 scx_simple 這個範例:
/*
 * sched_ext_ops instance for the scheduler named "simple".
 *
 * Only select_cpu, enqueue, init and exit are populated; each entry is the
 * corresponding simple_* program cast to void *. Every other callback in
 * struct sched_ext_ops is left unset by this initializer.
 */
SEC(".struct_ops")
struct sched_ext_ops simple_ops = {
.select_cpu = (void *)simple_select_cpu,
.enqueue = (void *)simple_enqueue,
.init = (void *)simple_init,
.exit = (void *)simple_exit,
.name = "simple",
};
最終我們會將前面定義好的 eBPF prog 以函式指標的方式指派到 sched_ext_ops 這個結構中。
而 sched_ext_ops 這個結構實際上是在 kernel source 定義好的 dummy interface,參考 kernel/sched/ext.c:
/*
 * Stub table for struct sched_ext_ops: every callback member is pointed at
 * a matching *_stub function. It is referenced through the .cfi_stubs
 * member of bpf_sched_ext_ops.
 *
 * The cgroup_* entries are only present when CONFIG_EXT_GROUP_SCHED is
 * defined.
 */
static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.select_cpu = select_cpu_stub,
.enqueue = enqueue_stub,
.dequeue = dequeue_stub,
.dispatch = dispatch_stub,
.tick = tick_stub,
.runnable = runnable_stub,
.running = running_stub,
.stopping = stopping_stub,
.quiescent = quiescent_stub,
.yield = yield_stub,
.core_sched_before = core_sched_before_stub,
.set_weight = set_weight_stub,
.set_cpumask = set_cpumask_stub,
.update_idle = update_idle_stub,
.cpu_acquire = cpu_acquire_stub,
.cpu_release = cpu_release_stub,
.init_task = init_task_stub,
.exit_task = exit_task_stub,
.enable = enable_stub,
.disable = disable_stub,
#ifdef CONFIG_EXT_GROUP_SCHED
.cgroup_init = cgroup_init_stub,
.cgroup_exit = cgroup_exit_stub,
.cgroup_prep_move = cgroup_prep_move_stub,
.cgroup_move = cgroup_move_stub,
.cgroup_cancel_move = cgroup_cancel_move_stub,
.cgroup_set_weight = cgroup_set_weight_stub,
#endif
.cpu_online = cpu_online_stub,
.cpu_offline = cpu_offline_stub,
.init = init_stub,
.exit = exit_stub,
.dump = dump_stub,
.dump_cpu = dump_cpu_stub,
.dump_task = dump_task_stub,
};
/*
 * bpf_struct_ops descriptor named "sched_ext_ops".
 *
 * Bundles the bpf_scx_* hooks (verifier ops, reg/unreg, member check/init,
 * init, update, validate) and points .cfi_stubs at the
 * __bpf_ops_sched_ext_ops stub table. This is the object handed to
 * register_bpf_struct_ops().
 */
static struct bpf_struct_ops bpf_sched_ext_ops = {
.verifier_ops = &bpf_scx_verifier_ops,
.reg = bpf_scx_reg,
.unreg = bpf_scx_unreg,
.check_member = bpf_scx_check_member,
.init_member = bpf_scx_init_member,
.init = bpf_scx_init,
.update = bpf_scx_update,
.validate = bpf_scx_validate,
.name = "sched_ext_ops",
.owner = THIS_MODULE,
.cfi_stubs = &__bpf_ops_sched_ext_ops
};
最後在 scx_init 中完成對該介面的註冊:`ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);`。
上面的解說可能沒辦法讓大家完全理解 struct_ops,所以筆者使用 bpftool 來觀察當 eBPF scheduler attached 後,會出現哪些新的資訊:
> sudo bpftool map list
[sudo] password for ian:
6362: percpu_array name cpu_ctx_stor flags 0x0
key 4B value 24B max_entries 1 memlock 752B
btf_id 5965
pids main(791867)
6363: task_storage name task_ctx_stor flags 0x1
key 4B value 32B max_entries 0 memlock 912B
btf_id 5965
pids main(791867)
6364: ringbuf name queued flags 0x0
key 0B value 0B max_entries 262144 memlock 275736B
pids main(791867)
6365: user_ringbuf name dispatched flags 0x0
key 0B value 0B max_entries 262144 memlock 275736B
pids main(791867)
6366: hash name pid_mm_fault_ma flags 0x0
key 4B value 8B max_entries 4096 memlock 330656B
btf_id 5965
pids main(791867)
6367: array name usersched_timer flags 0x0
key 4B value 16B max_entries 1 memlock 280B
btf_id 5965
pids main(791867)
6369: array name main.rodata flags 0x480
key 4B value 504B max_entries 1 memlock 8192B
btf_id 5965 frozen
pids main(791867)
6370: array name .data.uei_dump flags 0x400
key 4B value 1B max_entries 1 memlock 8192B
btf_id 5965
pids main(791867)
6371: array name main.data flags 0x400
key 4B value 1425B max_entries 1 memlock 8192B
btf_id 5965
pids main(791867)
6372: array name main.bss flags 0x400
key 4B value 92B max_entries 1 memlock 8192B
btf_id 5965
pids main(791867)
6373: struct_ops name goland flags 0x2000
key 4B value 512B max_entries 1 memlock 6096B
btf_id 5965
pids main(791867)
6376: array name libbpf_global flags 0x0
key 4B value 32B max_entries 1 memlock 296B
6377: array name pid_iter.rodata flags 0x480
key 4B value 4B max_entries 1 memlock 8192B
btf_id 5976 frozen
pids bpftool(792421)
6378: array name libbpf_det_bind flags 0x0
key 4B value 32B max_entries 1 memlock 296B
只要是包含 pids main(791867) 資訊的 BPF MAP,都是由 eBPF scheduler 建立的 BPF MAP。接著,讓我們觀察 goland 這個 struct_ops map 的內容:
> sudo bpftool map dump name goland
[{
"value": {
"common": {
"refcnt": {
"refs": {
"counter": 1
}
},
"state": "BPF_STRUCT_OPS_STATE_READY"
},
"data": {
"select_cpu": "0x28aa",
"enqueue": "0x28ad",
"dequeue": "(nil)",
"dispatch": "0x28ae",
"tick": "(nil)",
"runnable": "(nil)",
"running": "0x28af",
"stopping": "0x28b0",
"quiescent": "(nil)",
"yield": "(nil)",
"core_sched_before": "(nil)",
"set_weight": "(nil)",
"set_cpumask": "0x28b2",
"update_idle": "0x28b1",
"cpu_acquire": "(nil)",
"cpu_release": "0x28b3",
"init_task": "0x28b4",
"exit_task": "(nil)",
"enable": "(nil)",
"disable": "(nil)",
"dump": "(nil)",
"dump_cpu": "(nil)",
"dump_task": "(nil)",
"cgroup_init": "(nil)",
"cgroup_exit": "(nil)",
"cgroup_prep_move": "(nil)",
"cgroup_move": "(nil)",
"cgroup_cancel_move": "(nil)",
"cgroup_set_weight": "(nil)",
"cpu_online": "(nil)",
"cpu_offline": "(nil)",
"init": "0x28b5",
"exit": "0x28b6",
"dispatch_max_batch": 512,
"flags": 3,
"timeout_ms": 5000,
"exit_dump_len": 0,
"hotplug_seq": 0,
"name": "goland"
}
}
}
]
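我們就能找到每一個 scheduler ops 對應的函式指標欄位囉:未實作的 callback 會顯示為 (nil);已實作的 callback 所顯示的值(例如 0x28aa)是核心回報給 user space 的識別值,也就是對應 eBPF program 的 ID,而非實際的核心記憶體位址。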
透過這篇文章可以得知:
至於排程器的各個進入點是什麼?就交給明天的文章探討囉。