ebpf maps 内核态代码分析
创建maps BPF_MAP_CREATE
map = find_and_alloc_map(attr); //根据map类型创建map对象
if (IS_ERR(map))
return PTR_ERR(map);
err = bpf_obj_name_cpy(map->name, attr->map_name);
if (err)
goto free_map_nouncharge;
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
err = security_bpf_map_alloc(map);
if (err)
goto free_map_nouncharge;
err = bpf_map_charge_memlock(map);
if (err)
goto free_map_sec;
err = bpf_map_alloc_id(map); //将map存放在idr数据结构中
if (err)
goto free_map;
err = bpf_map_new_fd(map, f_flags); //为map分配一个文件描述符(fd),该fd最终返回给用户态
if (err < 0) {
/* failed to allocate fd.
* bpf_map_put() is needed because the above
* bpf_map_alloc_id() has published the map
* to the userspace and the userspace may
* have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
*/
bpf_map_put(map);
return err;
}
trace_bpf_map_create(map, err);
return err; //返回文件描述符
查找、更新和删除没什么好说的,代码写得很清楚。代码在 kernel/bpf/syscall.c 文件中,入口函数分别是 map_lookup_elem、map_update_elem 和 map_delete_elem。
比较关键的是,内核创建的 map 对象怎么跟 ebpf 程序关联起来。这个关联是在 ebpf 程序加载的时候进行的,相关代码在 bpf_prog_load 函数中:
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;
prog->aux->load_time = ktime_get_boot_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
if (err)
goto free_prog;
/* run eBPF verifier */
err = bpf_check(&prog, attr); //这个函数对ebpf程序进行检查并将与fd关联的map对象替换到ebpf程序中
if (err < 0)
goto free_used_maps;
/* eBPF program is ready to be JITed */
if (!prog->bpf_func)
prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;
err = bpf_prog_new_fd(prog);
关键函数是 replace_map_fd_with_map_ptr,它在校验器(bpf_check)流程中被调用:
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
if (bpf_prog_is_dev_bound(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env);
if (ret)
goto err_unlock;
}
ret = replace_map_fd_with_map_ptr(env); //将文件描述符替换成map指针地址
if (ret < 0)
goto skip_full_check;
env->explored_states = kcalloc(env->prog->len,
sizeof(struct bpf_verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
ret = check_cfg(env);
在 replace_map_fd_with_map_ptr 函数中可以看到:先从指令的 imm 字段取出文件描述符,通过 fdget 根据文件描述符获取对应的文件,最后取出文件的 private_data,即 map 对象的地址。
f = fdget(insn->imm);
map = __bpf_map_get(f); //获取map对象地址
if (IS_ERR(map)) {
verbose(env, "fd %d is not pointing to valid bpf_map\n",
insn->imm);
return PTR_ERR(map);
}
err = check_map_prog_compatibility(env, map, env->prog);
if (err) {
fdput(f);
return err;
}
/* store map pointer inside BPF_LD_IMM64 instruction */
insn[0].imm = (u32) (unsigned long) map;
insn[1].imm = ((u64) (unsigned long) map) >> 32; //将地址赋值到insn中,这样ebpf虚拟机可以直接访问map。
/* check whether we recorded this map already */
for (j = 0; j < env->used_map_cnt; j++)
if (env->used_maps[j] == map) {
fdput(f);
goto next_insn;
}
if (env->used_map_cnt >= MAX_USED_MAPS) {
fdput(f);
return -E2BIG;
}
至此,ebpf map 在用户态和内核态之间传递数据的方式基本上已经搞清楚了。