https://zhuanlan.zhihu.com/p/264922917
eBPF is an interpreter that lives inside the kernel and implements its own RISC-style instruction set. Its registers map onto the host calling convention as follows:
* R0 - return value from in-kernel function, and exit value for eBPF program
* R1 - R5 - arguments from eBPF program to in-kernel function
* R6 - R9 - callee saved registers that in-kernel function will preserve
* R10 - read-only frame pointer to access stack
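A minimal sketch of this calling convention, using the insn-building macros from the kernel's include/linux/filter.h: a helper's arguments travel in R1-R5, its return value comes back in R0, and R0 is also the program's exit value.

struct bpf_insn prog[] = {
    /* call helper BPF_FUNC_get_smp_processor_id: it takes no
     * arguments, so R1-R5 are unused; R6-R9 survive the call */
    BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
                 BPF_FUNC_get_smp_processor_id),
    /* R0 now holds the helper's return value, which doubles as
     * the program's exit value */
    BPF_EXIT_INSN(),
};

Each of these macros expands to one fixed-size instruction: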
struct bpf_insn {
    __u8  code;       /* opcode */
    __u8  dst_reg:4;  /* dest register */
    __u8  src_reg:4;  /* source register */
    __s16 off;        /* signed offset */
    __s32 imm;        /* signed immediate constant */
};
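For a concrete encoding, the BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4) instruction used in the verifier example later in this article is just an initializer for this struct (this is the literal expansion of the macro in include/linux/filter.h):

((struct bpf_insn) {
    .code    = BPF_ALU64 | BPF_OP(BPF_ADD) | BPF_K, /* 64-bit add of an immediate */
    .dst_reg = BPF_REG_2,
    .src_reg = 0,
    .off     = 0,
    .imm     = -4 })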
#define MAX_BPF_STACK 512
#define DST regs[insn->dst_reg]
#define SRC regs[insn->src_reg]
#define IMM insn->imm
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
    u64 stack[MAX_BPF_STACK / sizeof(u64)];
    u64 regs[MAX_BPF_REG], tmp;
    static const void *jumptable[256] = {
        [0 ... 255] = &&default_label,
        /* Now overwrite non-defaults ... */
        /* 32 bit ALU operations */
        [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
        [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
        [...]
    };
#define CONT     ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

select_insn:
    goto *jumptable[insn->code];

    /* ALU */
#define ALU(OPCODE, OP)            \
    ALU64_##OPCODE##_X:            \
        DST = DST OP SRC;          \
        CONT;                      \
    ALU_##OPCODE##_X:              \
        DST = (u32) DST OP (u32) SRC; \
        CONT;                      \
    ALU64_##OPCODE##_K:            \
        DST = DST OP IMM;          \
        CONT;                      \
    ALU_##OPCODE##_K:              \
        DST = (u32) DST OP (u32) IMM; \
        CONT;

    ALU(ADD, +)
    ALU(SUB, -)
    ALU(AND, &)
    ALU(OR, |)
    ALU(LSH, <<)
    ALU(RSH, >>)
    ALU(XOR, ^)
    [...]
}
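The &&label / goto *ptr construct is GCC's computed-goto extension; each opcode dispatches directly to the next handler instead of going through a switch. A standalone miniature of the same threaded-dispatch pattern:

#include <stdio.h>

int main(void)
{
    static const int prog[] = { 0, 0, 1 };  /* opcodes: 0 = print, 1 = halt */
    static const void *jumptable[] = { &&op_print, &&op_halt };
    const int *pc = prog;

    goto *jumptable[*pc];       /* initial dispatch, like select_insn */
op_print:
    puts("insn");
    pc++;
    goto *jumptable[*pc];       /* CONT: advance and redispatch */
op_halt:
    return 0;
}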
void bpf_int_jit_compile(struct bpf_prog *prog)
{
    /* JIT in multiple passes: jump offsets shrink as the image
     * converges, so re-emit until proglen is stable */
    for (pass = 0; pass < 10; pass++) {
        proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
        [...]
    }
}
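Both the verifier and (when net.core.bpf_jit_enable is set) the JIT are driven by the BPF_PROG_LOAD command of the bpf(2) syscall. A user-space loader sketch, assuming a 64-bit Linux with the bpf(2) syscall; error handling omitted:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

int load_prog(void)
{
    struct bpf_insn insns[] = {
        { .code = BPF_ALU64 | BPF_MOV | BPF_K,
          .dst_reg = BPF_REG_0, .imm = 0 },   /* R0 = 0 */
        { .code = BPF_JMP | BPF_EXIT },       /* return R0 */
    };
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
    attr.insns     = (__u64)(unsigned long)insns;
    attr.insn_cnt  = sizeof(insns) / sizeof(insns[0]);
    attr.license   = (__u64)(unsigned long)"GPL";

    /* verification (and JIT compilation) happens inside this call */
    return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}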
The verifier works in two main steps: (1) a depth-first walk of the control-flow graph that rejects loops and unreachable instructions, and (2) a simulated execution of every path from the first instruction, tracking the state of all registers and stack slots. From the comment in kernel/bpf/verifier.c:
* Corresponding eBPF program may look like:
* BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
* BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
* BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
/* types of values stored in eBPF registers */
enum bpf_reg_type {
    NOT_INIT = 0,            /* nothing was written into register */
    UNKNOWN_VALUE,           /* reg doesn't contain a valid pointer */
    PTR_TO_CTX,              /* reg points to bpf_context */
    CONST_PTR_TO_MAP,        /* reg points to struct bpf_map */
    PTR_TO_MAP_VALUE,        /* reg points to map element value */
    PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
    FRAME_PTR,               /* reg == frame_pointer */
    PTR_TO_STACK,            /* reg == frame_pointer + imm */
    CONST_IMM,               /* constant integer value */
};

struct reg_state {
    enum bpf_reg_type type;
    union {
        /* valid when type == CONST_IMM | PTR_TO_STACK */
        int imm;
        /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
         * PTR_TO_MAP_VALUE_OR_NULL
         */
        struct bpf_map *map_ptr;
    };
};

enum bpf_stack_slot_type {
    STACK_INVALID,    /* nothing was stored in this stack slot */
    STACK_SPILL,      /* 1st byte of register spilled into stack */
    STACK_SPILL_PART, /* other 7 bytes of register spill */
    STACK_MISC        /* BPF program wrote some data into this slot */
};

struct bpf_stack_slot {
    enum bpf_stack_slot_type stype;
    struct reg_state reg_st;
};

/* state of the program:
 * type of all registers and stack info
 */
struct verifier_state {
    struct reg_state regs[MAX_BPF_REG];
    struct bpf_stack_slot stack[MAX_BPF_STACK];
};
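To see this state tracking in action, a sketch of a two-instruction program the simulation step rejects: R2 is read before anything was written to it, so its reg_state type is still NOT_INIT (the quoted error string comes from the verifier log):

struct bpf_insn bad[] = {
    BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),  /* R2 never written: "R2 !read_ok" */
    BPF_EXIT_INSN(),
};

Stack slots get the same treatment: check_stack_write() below updates the modeled stack for every store the program makes.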
static int check_stack_write(struct verifier_state *state, int off, int size,
                             int value_regno)
{
    struct bpf_stack_slot *slot;
    int i;

    if (value_regno >= 0 &&
        (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
         state->regs[value_regno].type == PTR_TO_STACK ||
         state->regs[value_regno].type == PTR_TO_CTX)) {
        /* register containing pointer is being spilled into stack */
        if (size != 8) {
            verbose("invalid size of register spill\n");
            return -EACCES;
        }
        /* off is negative: the stack grows down from the frame pointer */
        slot = &state->stack[MAX_BPF_STACK + off];
        slot->stype = STACK_SPILL;
        /* save register state */
        slot->reg_st = state->regs[value_regno];
        for (i = 1; i < 8; i++) {
            slot = &state->stack[MAX_BPF_STACK + off + i];
            slot->stype = STACK_SPILL_PART;
            slot->reg_st.type = UNKNOWN_VALUE;
            slot->reg_st.map_ptr = NULL;
        }
    } else {
        /* regular write of data into stack */
        for (i = 0; i < size; i++) {
            slot = &state->stack[MAX_BPF_STACK + off + i];
            slot->stype = STACK_MISC;
            slot->reg_st.type = UNKNOWN_VALUE;
            slot->reg_st.map_ptr = NULL;
        }
    }
    return 0;
}
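This is what lets a pointer spill/fill round-trip keep its type while partial spills are refused. For instance (a sketch using the macros from include/linux/filter.h):

/* spill R1 (PTR_TO_CTX) to the stack, then fill it back: allowed,
 * and R1 is still usable as a context pointer afterwards */
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -8),
/* a 4-byte (BPF_W) spill of the same pointer would hit
 * "invalid size of register spill" in check_stack_write() */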
/* used for state pruning: if an already-verified state at this
 * instruction is equivalent to the current one, the rest of this
 * path has already been proven safe and need not be re-walked
 */
static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
{
    int i;

    for (i = 0; i < MAX_BPF_REG; i++) {
        if (memcmp(&old->regs[i], &cur->regs[i],
                   sizeof(old->regs[0])) != 0) {
            if (old->regs[i].type == NOT_INIT ||
                old->regs[i].type == UNKNOWN_VALUE)
                continue;
            return false;
        }
    }
    for (i = 0; i < MAX_BPF_STACK; i++) {
        if (memcmp(&old->stack[i], &cur->stack[i],
                   sizeof(old->stack[0])) != 0) {
            if (old->stack[i].stype == STACK_INVALID)
                continue;
            return false;
        }
    }
    return true;
}
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
                                        const struct bpf_prog *prog,
                                        struct bpf_insn_access_aux *info)
{
    if (off < 0 || off >= sizeof(struct pt_regs))
        return false;
    if (type != BPF_READ)
        return false;
    if (off % size != 0)
        return false;
    /*
     * Assertion for 32 bit to make sure last 8 byte access
     * (BPF_DW) to the last 4 byte member is disallowed.
     */
    if (off + size > sizeof(struct pt_regs))
        return false;
    return true;
}
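So for a kprobe program the only legal ctx (R1) accesses are aligned reads inside sizeof(struct pt_regs). A sketch for x86-64, where regs->ip is the probed instruction address:

/* R0 = ctx->ip: an aligned 8-byte read inside pt_regs, so accepted */
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, offsetof(struct pt_regs, ip)),
/* any store to ctx, or a read past the struct, fails the checks above */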
static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
    /* check args */
    err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
    if (err)
        return err;
    err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
    if (err)
        return err;
    [...]
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
    .func      = bpf_trace_printk,
    .gpl_only  = true,
    .ret_type  = RET_INTEGER,
    .arg1_type = ARG_PTR_TO_MEM,
    .arg2_type = ARG_CONST_SIZE,
};
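The proto is exactly what check_func_arg() enforces: R1 must point to initialized memory and R2 must be a constant size covering it (gpl_only additionally requires a GPL-compatible program license). A typical call in BCC-style BPF C keeps the format string on the stack, so the verifier can see both the pointer and its bound:

char fmt[] = "cpu = %d\n";
bpf_trace_printk(fmt, sizeof(fmt), cpu);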
static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
    switch (func_id) {
    case BPF_FUNC_perf_event_read:
        return &bpf_perf_event_read_proto;
    case BPF_FUNC_probe_write_user:
        return bpf_get_probe_write_proto();
    [...]
    }
}
Writes to kernel memory are a different story. Suppose the BPF C source says:

struct task_struct* t = (struct task_struct*)bpf_get_current_task();
t->cpu = 111;

A front end such as BCC rewrites the member access into a bpf_probe_read() statement expression, which is not an lvalue, so the assignment no longer compiles:

({ typeof(unsigned int) _val; __builtin_memset(&_val, 0, sizeof(_val)); bpf_probe_read(&_val, sizeof(_val), (u64)&t->cpu); _val; }) = 111;
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^
commit 2541517c32b ("tracing, perf: Implement BPF programs attached to kprobes")
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1134,11 +1134,15 @@ static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
+ struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
+ if (prog && !trace_call_bpf(prog, regs))
+ return;
+
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
    /* ctx is the struct pt_regs * that kprobe_perf_func() passed in */
    ret = BPF_PROG_RUN(prog, ctx);
    [...]
}
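Closing the loop from user space: after BPF_PROG_LOAD, the program is attached to a kprobe perf event with the PERF_EVENT_IOC_SET_BPF ioctl, which sets call->prog above; from then on every probe hit runs through trace_call_bpf(). A sketch, assuming perf_fd comes from perf_event_open(2) on a kprobe event and prog_fd from the loader earlier; error handling omitted:

#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int attach_prog(int perf_fd, int prog_fd)
{
    /* sets call->prog: every kprobe hit now runs the BPF program */
    if (ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, prog_fd) < 0)
        return -1;
    return ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, 0);
}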