本文主要是介绍一次 irq_fpu_usable 内核报错排查总结,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
开发了一个内核ko模块async_memory_reclaim_for_cold_file_area(内核版本5.14.0-284.11.1),使用kprobe技术捕捉内核copy_page_to_iter()函数,在里边执行自定义的hot_file_update_file_status()函数,统计文件页page的访问信息,源码简要如下:
- static void kprobe_handler_post(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
- {
- struct page *page = (struct page *)(regs->di);
- if(page){
- hot_file_update_file_status(page);
- }
- }
- static struct kprobe kp_read_cache_func = {
- //buffer io read读取文件页page数据执行到 copy_folio_to_iter()
- .symbol_name = "copy_page_to_iter",
- };
- kp_read_cache_func.post_handler = kprobe_handler_post;
- //注册kprobe函数
- ret = register_kprobe(&kp_read_cache_func);
但是偶发触发内核异常告警,如下:
- [ 2606.173350] WARNING: CPU: 3 PID: 3347 at arch/x86/kernel/fpu/core.c:60 irq_fpu_usable+0x39/0x50
- [ 2606.173420] fuse [last unloaded: async_memory_reclaim_for_cold_file_area_debug]
- [ 2606.173423] CPU: 3 PID: 3347 Comm: bash Kdump: loaded Tainted: G OE -------- h--- 5.14.0-284.11.1.el9_2.x86_64 #1
- [ 2606.173425] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
- [ 2606.173426] RIP: 0010:irq_fpu_usable+0x39/0x50
- [ 2606.173432] RSP: 0018:ffffb6fe009076e8 EFLAGS: 00010006
- [ 2606.173434] RAX: ffffffffa30c6794 RBX: ffffffffa30c6740 RCX: 0000000000000024
- [ 2606.173435] RDX: 0000000080110002 RSI: ffffffffa30c6770 RDI: ffffffffa30c6740
- [ 2606.173435] RBP: ffffffffa30c6770 R08: 0000000000000004 R09: 0000000000000004
- [ 2606.173436] R10: ffffffffa30c6794 R11: 00000000000319c0 R12: 0000000000000001
- [ 2606.173437] R13: ffffffffa30c6740 R14: 0000000000000024 R15: fffffc6140046780
- [ 2606.173438] FS: 00007f6c674d4740(0000) GS:ffff971e39ec0000(0000) knlGS:0000000000000000
- [ 2606.173461] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
- [ 2606.173462] CR2: 0000559a22f5dad0 CR3: 000000000c066004 CR4: 00000000003706e0
- [ 2606.173475] blake2s_compress+0x20/0xa0
- [ 2606.173480] blake2s_final+0x41/0x80
- [ 2606.173483] extract_entropy.constprop.0+0x94/0x250
- [ 2606.173493] crng_make_state+0x129/0x180
- [ 2606.173495] _get_random_bytes.part.0+0x4c/0x190
- [ 2606.173497] ? __alloc_pages+0xe6/0x230
- [ 2606.173499] get_random_u32+0x59/0x90
- [ 2606.173501] allocate_slab+0x14a/0x460
- [ 2606.173505] ___slab_alloc+0x44f/0x570
- [ 2606.173509] ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
- [ 2606.173513] ? find_busiest_group+0x65/0x190
- [ 2606.173516] ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
- [ 2606.173519] kmem_cache_alloc+0x294/0x300
- [ 2606.173522] hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
- [ 2606.173526] ? 0xffffffffc0572011
- [ 2606.173528] kprobe_post_process+0x25/0x70
- [ 2606.173531] ? 0xffffffffc0572012
- [ 2606.173532] kprobe_int3_handler+0x154/0x180
- [ 2606.173534] do_int3+0x3b/0x80
- [ 2606.173536] exc_int3+0x81/0xc0
- [ 2606.173546] RSP: 0018:ffffb6fe00907bf0 EFLAGS: 00000246
- [ 2606.173548] RBP: 0000000000000000 R08: 0000000000000402 R09: ffff971e10b97500
- [ 2606.173549] R10: ffffb6fe00907d98 R11: ffffffffffffffff R12: ffffb6fe00907d38
- [ 2606.173550] R13: ffffb6fe00907d60 R14: 0000000000000100 R15: 0000000000000000
- [ 2606.173553] ? 0xffffffffc0572012
- [ 2606.173554] filemap_read+0x18a/0x320
- ...........................
显然是irq_fpu_usable()函数里触发的问题。为了查清楚根因,/proc/sys/kernel/panic_on_warn置1。下次触发这个内核告警后,触发了内核crash,并保存了vmcoe。通过vmcore看到出问题时完整函数流程:
- crash> bt
- PID: 33 TASK: ffff971d01358000 CPU: 3 COMMAND: "bash"
- #0 [ffffb6fe00907400] machine_kexec at ffffffffa126c237
- #1 [ffffb6fe00907458] __crash_kexec at ffffffffa13c3c9a
- #2 [ffffb6fe00907518] panic at ffffffffa1cd7dc6
- #3 [ffffb6fe00907598] __warn.cold at ffffffffa1cd7feb
- #4 [ffffb6fe009075d0] report_bug at ffffffffa17ac20e
- #5 [ffffb6fe00907608] handle_bug at ffffffffa1d254bc
- #6 [ffffb6fe00907618] exc_invalid_op at ffffffffa1d25684
- #7 [ffffb6fe00907630] asm_exc_invalid_op at ffffffffa1e00af6
- [exception RIP: irq_fpu_usable+57]
- RIP: ffffffffa1232d29 RSP: ffffb6fe009076e8 RFLAGS: 00010006
- RAX: ffffffffa30c6794 RBX: ffffffffa30c6740 RCX: 0000000000000024
- RDX: 0000000080110002 RSI: ffffffffa30c6770 RDI: ffffffffa30c6740
- RBP: ffffffffa30c6770 R8: 0000000000000004 R9: 0000000000000004
- R10: ffffffffa30c6794 R11: 00000000000319c0 R12: 0000000000000001
- R13: ffffffffa30c6740 R14: 0000000000000024 R15: fffffc6140046780
- ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
- #8 [ffffb6fe009076e8] blake2s_compress at ffffffffa12e5ae0
- #9 [ffffb6fe00907718] blake2s_final at ffffffffa176e391
- #10 [ffffb6fe00907738] extract_entropy.constprop.0 at ffffffffa18e2904
- #11 [ffffb6fe009077d8] crng_reseed.constprop.0 at ffffffffa18e2b25
- #12 [ffffb6fe00907818] crng_make_state at ffffffffa18e2dd9
- #13 [ffffb6fe00907848] _get_random_bytes at ffffffffa18e2e7c
- #14 [ffffb6fe009078f8] get_random_u32 at ffffffffa18e3129
- #15 [ffffb6fe00907918] allocate_slab at ffffffffa1577bea
- #16 [ffffb6fe00907960] ___slab_alloc at ffffffffa157b17f
- #17 [ffffb6fe00907a00] kmem_cache_alloc at ffffffffa157d194
- #18 [ffffb6fe00907a48] hot_file_update_file_status at ffffffffc0dd9d24 [async_memory_reclaim_for_cold_file_area]
- #19 [ffffb6fe00907aa8] kprobe_post_process at ffffffffa126ec65
- #20 [ffffb6fe00907ab8] kprobe_int3_handler at ffffffffa126ef44
- #21 [ffffb6fe00907ae8] do_int3 at ffffffffa1225b2b
- #22 [ffffb6fe00907af8] exc_int3 at ffffffffa1d25f51
- #23 [ffffb6fe00907b10] asm_exc_int3 at ffffffffa1e00b35
- #24 [ffffb6fe00907b98] copy_page_to_iter at ffffffffa1764d32
- #25 [ffffb6fe00907bf8] filemap_read at ffffffffa14d7b9a
- ..................
触发问题的函数流程是,kprobe捕捉内核函数copy_page_to_iter。在读文件时执行到该函数时,产生int3中断,在中断服务函数里最后执行自定义的hot_file_update_file_status()函数,在该函数里执行kmem_cache_alloc()分配一个自定义slab数据结构。而在分配slab时,执行allocate_slab->shuffle_freelist->get_random_int分配一个随机数,源码如下:
- static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
- {
- void *start;
- void *cur;
- void *next;
- unsigned long idx, pos, page_limit, freelist_count;
- if (page->objects < 2 || !s->random_seq)
- return false;
- freelist_count = oo_objects(s->oo);
- //分配随机数,这里触发内核告警
- pos = get_random_int() % freelist_count;
- ............
- next = next_freelist_entry(s, page, &pos, start, page_limit,freelist_count);
- set_freepointer(s, cur, next);
- ............
- }
为什么分配slab时要分配随机数呢?查资料说是为了使分配的slab obj地址随机,不容易被攻击,安全考虑。
把这个过程的函数流程再整理一下hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable。再看下最后的几个关键blake2s_compress函数调用:
- void blake2s_compress(struct blake2s_state *state, const u8 *block,
- size_t nblocks, const u32 inc)
- {
- kernel_fpu_begin();
- ..................
- kernel_fpu_end();
- }
- static inline void kernel_fpu_begin(void)
- {
- kernel_fpu_begin_mask(KFPU_MXCSR);
- }
- void kernel_fpu_begin_mask(unsigned int kfpu_mask)
- {
- preempt_disable();
- //这里触发内核告警
- WARN_ON_FPU(!irq_fpu_usable());
- WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
- this_cpu_write(in_kernel_fpu, true);
- ..................
- }
是irq_fpu_usable()函数返回false触发的crash,看下这个函数的源码
- bool irq_fpu_usable(void)
- {
- if (WARN_ON_ONCE(in_nmi()))
- return false;
- if (this_cpu_read(in_kernel_fpu))
- return false;
- if (!in_hardirq())
- return true;
- return !softirq_count();
- }
显然,in_nmi()、this_cpu_read(in_kernel_fpu)、softirq_count()大于0都可能会导致该函数返回false,得一一排查。
crash时的cpu是cpu3,很快查找cpu3的in_kernel_fpu这个per cpu 变量是0,
- crash> in_kernel_fpu
- PER-CPU DATA TYPE:
- bool in_kernel_fpu;
- PER-CPU ADDRESSES:
- [0]: ffff971e39e18ed8
- [1]: ffff971e39e58ed8
- [2]: ffff971e39e98ed8
- //cpu3的in_kernel_fpu 变量地址
- [3]: ffff971e39ed8ed8
- crash> rd 0xffff971e39ed8ed8
- ffff971e39ed8ed8: 0000000000000000 ........
in_nmi() 和 softirq_count() 得查看cpu3的preempt_count变量,是0x 80110003,如下:
- crash> __preempt_count
- PER-CPU DATA TYPE:
- int __preempt_count;
- PER-CPU ADDRESSES:
- [0]: ffff971e39e18f00
- [1]: ffff971e39e58f00
- [2]: ffff971e39e98f00
- //cpu3的preempt_count变量地址
- [3]: ffff971e39ed8f00
- crash> rd -x ffff971e39ed8f00
- ffff971e39ed8f00: 0000000080110003
然后查看in_nmi()和softirq_count(),全都来自preempt_count()函数。
- #define PREEMPT_NEED_RESCHED 0x80000000
- static __always_inline int preempt_count(void)
- {
- return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
- }
- #define PREEMPT_BITS 8
- #define PREEMPT_SHIFT 0
- #define SOFTIRQ_BITS 8
- #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)//8
- #define __IRQ_MASK(x) ((1UL << (x))-1)
- #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)//16
- #define HARDIRQ_BITS 4
- // ((1 << 4) - 1) << 16 0xF << 16
- #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
- //preempt_count 的 bit16~bit19
- #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
- //((1 << 8) - 1) << 8 0x7F << 8
- #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
- //preempt_count 的 bit8~bit15
- # define softirq_count() (preempt_count() & SOFTIRQ_MASK)
- #define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)//20
- #define NMI_BITS 4
- //((1 << 4) - 1) << 20 0xF << 20
- #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
- //preempt_count 的 bit20~bit23
- #define nmi_count() (preempt_count() & NMI_MASK)
- #define in_nmi() (nmi_count())
cpu3的preempt_count()值是 0x80110003 , bit8~bit15 是0,bit16~bit19是1,bit20~bit23是1。即 in_nmi() 是1,in_hardirq()是1,softirq_count()是0。显然,问题根源竟然是 in_nmi() 是非0导致的。这个就是NMI计数,非0说明当前处于NMI中断。这怎么可能呢?怎么会处于NMI中断呢?什么是NMI中断,这个一般是当发生hard lock、cpu硬件级别错误时才会发生的不可屏蔽中断。可是这个问题场景只是我利用kprobe捕捉内核函数,然后产生int3中断,在中断服务函数里最后执行自定义的hot_file_update_file_status()函数,在该函数里执行kmem_cache_alloc()分配一个slab数据结构而已,这个普通场景怎么会触发NMI中断呢?
这个虚拟机是rocky 9.2 ,之前就出现过莫名奇妙的问题,怀疑内存数据有问题,难道这次又是内存数据有问题?当遇到莫名其妙的问题,就容易怀疑到内存有问题导致数据跳变、内存越界等导致?哪里会有这么多神奇的行为,还是先耐着性子看下源码吧。于是发现in_nmi()大于0是执行了 __nmi_enter()导致的,如下:
- #define __nmi_enter() \
- do { \
- lockdep_off(); \
- arch_nmi_enter(); \
- BUG_ON(in_nmi() == NMI_MASK); \
- __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \
- } while (0)
再进一步,发现int3中断里竟然在irqentry_nmi_enter()函数里执行了__nmi_enter,细节如下:
- irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
- {
- irqentry_state_t irq_state;
- irq_state.lockdep = lockdep_hardirqs_enabled();
- __nmi_enter();//令 in_nmi 加1
- lockdep_hardirqs_off(CALLER_ADDR0);
- lockdep_hardirq_enter();
- rcu_nmi_enter();
- instrumentation_begin();
- trace_hardirqs_off_finish();
- ftrace_nmi_enter();
- instrumentation_end();
- return irq_state;
- }
- DEFINE_IDTENTRY_RAW(exc_int3)
- {
- if (poke_int3_handler(regs))
- return;
- if (user_mode(regs)) {
- ……………
- } else {
- //令 in_nmi 加1
- irqentry_state_t irq_state = irqentry_nmi_enter(regs);
- instrumentation_begin();
- //执行kprobe 的函数
- if (!do_int3(regs))
- die("int3", regs, 0);
- instrumentation_end();
- //令 in_nmi 减1
- irqentry_nmi_exit(regs, irq_state);
- }
- }
- static bool do_int3(struct pt_regs *regs)
- {
- int res;
- #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
- if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
- SIGTRAP) == NOTIFY_STOP)
- return true;
- #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
- #ifdef CONFIG_KPROBES
- //执行kprobe 的函数
- if (kprobe_int3_handler(regs))
- return true;
- #endif
- res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
- return res == NOTIFY_STOP;
- }
简单说:从源码角度来说,每次kprobe触发的int3中断,在exc_int3()中执行kprobe hot_file_update_file_status函数前,都会先执行 irqentry_nmi_enter->__nmi_enter 令NMI中断加1。这样看到的,每次执行 hot_file_update_file_status->kmem_cache_alloc 函数分配slab 前,都会出发内核告警irq_fpu_usable。可实际测试是随机的?这是怎么回事?
我认为 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable,这个流程并不是每次都执行到最后的 irq_fpu_usable ,中间有很多条件判断,但是一旦执行到irq_fpu_usable ,必然会触发内核告警。
简单说,kprobe的函数里,不能执行 kmem_cache_alloc()分配slab 内存。因为kprobe的init3中断会令NMI计数加1,然后执行kmem_cache_alloc()分配slab有概率执行到irq_fpu_usable,因为NMI计数大于0而触发内核告警。
怎么解决呢?
- 1:在执行 hot_file_update_file_status->kmem_cache_alloc 前,自己的源码里强制令nmi计数减1,但怕对系统会有不良影响。
- 2:自己实现一套简易的slab源码,不执行 get_random_int() 函数就行。
但是这个两个方案改动太大,研究源码后,其实有个很简单的方法,不用动内核。
禁止掉 allocate_slab random 功能,就不会执行 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int 函数,具体改动如下:
- struct hot_cold_file_global
- {
- struct kmem_cache *file_stat_cachep;
- }
- struct hot_cold_file_global hot_cold_file_global_info;
- static void (*cache_random_seq_destroy)(struct kmem_cache *cachep);
- //获取内核cache_random_seq_destroy函数指针
- cache_random_seq_destroy = (void *)kallsyms_lookup_name_async("cache_random_seq_destroy");
- cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);
- //禁止分配 file_stat_cachep 这个slab时使用随机数功能,分配slab时就不会执行到shuffle_freelist->get_random_int函数
- cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);
当执行cache_random_seq_destroy()函数后,就令slab的random_seq为NULL
- void cache_random_seq_destroy(struct kmem_cache *cachep)
- {
- kfree(cachep->random_seq);
- cachep->random_seq = NULL;
- }
然后将来分配slab,执行到allocate_slab->shuffle_freelist时,直接返回false,不会再执行到get_random_int()函数。
- static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
- {
- void *start;
- void *cur;
- void *next;
- unsigned long idx, pos, page_limit, freelist_count;
- //直接return false
- if (page->objects < 2 || !s->random_seq)
- return false;
- freelist_count = oo_objects(s->oo);
- //分配随机数,这里触发crash
- pos = get_random_int() % freelist_count;
- ............
- }
这篇关于一次 irq_fpu_usable 内核报错排查总结的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!