
Four ways to register a module's globals when loading a shared library with dlopen.

Reading the kernel source, I noticed that when loading a module the kernel registers global structure variables onto linked lists in several ways, summarized below.
Method 1: write a dedicated function in the .so module that registers the module's globals onto the list. Example:

#include <stdio.h>
#include "main.h"
static void print_func()
{
    printf("%s\n","test_so1");
}
static struct so_funcs my_func = {
    .name="test_so1",
    .do_func=print_func,
};
void init_test_so1()
{
    test_funcs[0]=&my_func;
}
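
For reference, a main.h consistent with this code might look as follows; the original header is not shown, so this is my reconstruction from how main.c and the modules use it.

#ifndef MAIN_H
#define MAIN_H

struct so_funcs {
    const char *name;        /* module name */
    void (*do_func)(void);   /* function the module exposes */
};

/* global registration table, defined in main.c */
extern struct so_funcs *test_funcs[3];

#endif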

With this method, the main program calls dlopen to open the library, then dlsym to find the address of init_test_so1, and calls that function.
Method 2: use the constructor attribute so the init function runs automatically when the .so is loaded. Code:

#include <stdio.h>
#include "main.h"
static void print_func()
{
    printf("%s\n","test_so2");
}
static struct so_funcs my_func = {
    .name="test_so2",
    .do_func=print_func,
};
void __attribute__((constructor)) init_test_so2()
{
    test_funcs[1]=&my_func;
}

Because of __attribute__((constructor)), dlopen automatically runs init_test_so2 when it opens the library, which registers the global variable onto the list.
Method 3 is similar to method 1. Method 1 uses dlsym to find a function symbol's address and then calls the function; method 3 finds a variable symbol and registers it onto the list directly. There are two ways to locate the variable symbol: look up its address by name, or place the variable in a special section and find the address through that section. Example:

#include <stdio.h>
#include "main.h"
static void print_func()
{
    printf("%s\n","test_so3");
}
static struct so_funcs my_func = {
    .name="test_so3",
    .do_func=print_func,
};
static struct so_funcs __attribute__((section(".my_sections"))) *module_func = &my_func; /* placed in the special section .my_sections; the main program locates it through this section */
struct so_funcs __attribute__((section(".my_sections"))) *module_func1 = &my_func;

The main program is as follows:

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <link.h>
#include <dlfcn.h>
#include "main.h"
#define test1_path "./libtest_so1.so"
#define test2_path "./libtest_so2.so"
#define test3_path "./libtest_so3.so"
struct so_funcs *test_funcs[3];
void analysis_test3(void *test3)
{
    struct link_map *map;
    dlinfo(test3, RTLD_DI_LINKMAP, &map);
    printf("%p: %s: %p\n", (void *)map->l_addr, map->l_name, map->l_ld);
    test_funcs[2] = *(struct so_funcs **)((unsigned long)(map->l_ld)+(unsigned long)(0x1060-0xe18)); /* note: 0x1060 and 0xe18 were read from objdump -h libtest_so3.so; the lookup could be automated, but this simple example doesn't */
    printf("----------%p\n",test_funcs[2]);

}
int main()
{
    void *test1_handle=NULL;
    void *test2_handle=NULL;
    void *test3_handle=NULL;
    void (*register_func)();
    int i=0;
    for (i=0;i<3;i++)
        test_funcs[i]=NULL;
    test1_handle = dlopen(test1_path,RTLD_LAZY|RTLD_GLOBAL);
    printf("address = %p\n",test1_handle);
    printf("error %s\n",dlerror());
    test2_handle = dlopen(test2_path,RTLD_LAZY|RTLD_GLOBAL);
    printf("address = %p\n",test2_handle);
    printf("error %s\n",dlerror());
    test3_handle = dlopen(test3_path,RTLD_LAZY|RTLD_GLOBAL);
    printf("address = %p\n",test3_handle);
    printf("error %s\n",dlerror());
    register_func=(void(*)())dlsym(test1_handle,"init_test_so1");
    printf("address is %p\n",register_func);
    printf("error %s\n",dlerror());
    register_func();
    analysis_test3(test3_handle);
    for(i=0;i<3;i++){
        if(test_funcs[i]!=NULL){
            printf("funcs %s is run\n",test_funcs[i]->name);
            test_funcs[i]->do_func();
        }else{
            printf("funcs %d is NULL\n",i);
        }
    }
    struct so_funcs *test33=*(struct so_funcs **)dlsym(test3_handle,"module_func1");
    printf("last address %p\n",test33);
    return 0;
}

main.c is straightforward. The only subtle point is how to find the start address of the designated section: this example simply takes the offset of .my_sections relative to the dynamic section, adds it to the in-memory address of the dynamic section, and reads the pointer stored there. The offsets are currently worked out by hand, but this could be automated.
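
As a sketch of how that automation might look, one could parse the ELF section headers of the .so on disk and compute the offsets instead of reading them off objdump. This is my own illustration, not part of the original example; it assumes a 64-bit ELF and omits error handling.

#include <elf.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* return sh_addr of the named section of a 64-bit ELF file, or 0 if absent */
static unsigned long section_vaddr(const char *path, const char *name)
{
    FILE *f = fopen(path, "rb");
    Elf64_Ehdr eh;
    fread(&eh, sizeof(eh), 1, f);

    Elf64_Shdr *shdrs = malloc(eh.e_shnum * sizeof(*shdrs));
    fseek(f, eh.e_shoff, SEEK_SET);
    fread(shdrs, sizeof(*shdrs), eh.e_shnum, f);

    /* load the section-name string table */
    Elf64_Shdr *strsh = &shdrs[eh.e_shstrndx];
    char *strtab = malloc(strsh->sh_size);
    fseek(f, strsh->sh_offset, SEEK_SET);
    fread(strtab, 1, strsh->sh_size, f);

    unsigned long addr = 0;
    for (int i = 0; i < eh.e_shnum; i++)
        if (strcmp(strtab + shdrs[i].sh_name, name) == 0)
            addr = shdrs[i].sh_addr;

    free(strtab);
    free(shdrs);
    fclose(f);
    return addr;
}

With this, the hand-computed 0x1060-0xe18 in analysis_test3 could be replaced by section_vaddr(test3_path, ".my_sections") - section_vaddr(test3_path, ".dynamic"), since the two sections keep the same relative distance once the library is mapped.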
The Makefile:

all:libtest_so1.so libtest_so2.so libtest_so3.so
    gcc -rdynamic -o main main.c -ldl 

libtest_so1.so:test_so1.c
    gcc -c -fPIC test_so1.c
    gcc test_so1.o -shared -o libtest_so1.so

libtest_so2.so:test_so2.c
    gcc -c -fPIC test_so2.c
    gcc test_so2.o -shared -o libtest_so2.so

libtest_so3.so:test_so3.c
    gcc -c -fPIC test_so3.c
    gcc test_so3.o -shared -o libtest_so3.so
clean:
    rm -rf *.o *.so main

Note: when compiling the main program you must pass -rdynamic; otherwise dlopen will fail with an error that the global variables defined in main.c cannot be found.


kernel trace_event macro expansion, annotated

The best way to understand the kernel's trace_event mechanism is to work through the sample code in samples/trace_events/. The sample is quite hard to read; I digested it in three stages: how events are declared and defined, how they are registered with the system, and how they are enabled.
The sample contains one very useful macro whose expansion completes both the declaration and the definition of a trace_event. When reading the sample, we follow only the foo_bar trace_event.
First look at the header trace-events-sample.h. This header is peculiar: it can be included multiple times, and at its very end it includes another header, trace/define_trace.h. That include is where the magic begins.
Let's analyze trace-events-sample.h.
It starts by including linux/tracepoint.h; note that that header can only be included once, being protected by #ifndef/#endif guards.
Immediately after comes the following macro invocation.

TRACE_EVENT(foo_bar,

        TP_PROTO(const char *foo, int bar, const int *lst,
                 const char *string, const struct cpumask *mask),

        TP_ARGS(foo, bar, lst, string, mask),

        TP_STRUCT__entry(
                __array(        char,   foo,    10              )
                __field(        int,    bar                     )
                __dynamic_array(int,    list,   __length_of(lst))
                __string(       str,    string                  )
                __bitmask(      cpus,   num_possible_cpus()     )
        ),

        TP_fast_assign(
                strlcpy(__entry->foo, foo, 10);
                __entry->bar    = bar;
                memcpy(__get_dynamic_array(list), lst,
                       __length_of(lst) * sizeof(int));
                __assign_str(str, string);
                __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus());
        ),

        TP_printk("foo %s %d %s %s %s %s (%s)", __entry->foo, __entry->bar,

/*
 * Notice here the use of some helper functions. This includes:
 *
 *  __print_symbolic( variable, { value, "string" }, ... ),
 *
 *    The variable is tested against each value of the { } pair. If
 *    the variable matches one of the values, then it will print the
 *    string in that pair. If none are matched, it returns a string
 *    version of the number (if __entry->bar == 7 then "7" is returned).
 */
                  __print_symbolic(__entry->bar,
                                   { 0, "zero" },
                                   { TRACE_SAMPLE_FOO, "TWO" },
                                   { TRACE_SAMPLE_BAR, "FOUR" },
                                   { TRACE_SAMPLE_ZOO, "EIGHT" },
                                   { 10, "TEN" }
                          ),

/*
 *  __print_flags( variable, "delim", { value, "flag" }, ... ),
 *
 *    This is similar to __print_symbolic, except that it tests the bits
 *    of the value. If ((FLAG & variable) == FLAG) then the string is
 *    printed. If more than one flag matches, then each one that does is
 *    also printed with delim in between them.
 *    If not all bits are accounted for, then the not found bits will be
 *    added in hex format: 0x506 will show BIT2|BIT4|0x500
 */
                  __print_flags(__entry->bar, "|",
                                { 1, "BIT1" },
                                { 2, "BIT2" },
                                { 4, "BIT3" },
                                { 8, "BIT4" }
                          ),
/*
 *  __print_array( array, len, element_size )
 *
 *    This prints out the array that is defined by __array in a nice format.
 */
                  __print_array(__get_dynamic_array(list),
                                __get_dynamic_array_len(list) / sizeof(int),
                                sizeof(int)),
                  __get_str(str), __get_bitmask(cpus))
);

This macro is defined in linux/tracepoint.h and expands as defined there:

#define TRACE_EVENT(name, proto, args, struct, assign, print)   \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))

Next, the DECLARE_TRACE expansion:

#define DECLARE_TRACE(name, proto, args)                                \
        __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args),              \
                        cpu_online(raw_smp_processor_id()),             \
                        PARAMS(void *__data, proto),                    \
                        PARAMS(__data, args))

And the __DECLARE_TRACE expansion:

#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
        extern struct tracepoint __tracepoint_##name;                   \
        static inline void trace_##name(proto)                          \  // this is the function inserted at the trace site; e.g. the sample calls trace_foo_bar wherever it wants a trace
        {                                                               \
                if (static_key_false(&__tracepoint_##name.key))         \
                        __DO_TRACE(&__tracepoint_##name,                \
                                TP_PROTO(data_proto),                   \
                                TP_ARGS(data_args),                     \
                                TP_CONDITION(cond), 0);                 \
                if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {             \
                        rcu_read_lock_sched_notrace();                  \
                        rcu_dereference_sched(__tracepoint_##name.funcs);\
                        rcu_read_unlock_sched_notrace();                \
                }                                                       \
        }                                                               \
        __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args),          \
                PARAMS(cond), PARAMS(data_proto), PARAMS(data_args))    \
        static inline int                                               \
        register_trace_##name(void (*probe)(data_proto), void *data)    \ // registers a probe function; used by raw tracepoints, not by trace_event
        {                                                               \
                return tracepoint_probe_register(&__tracepoint_##name,  \
                                                (void *)probe, data);   \
        }                                                               \
        static inline int                                               \
        register_trace_prio_##name(void (*probe)(data_proto), void *data,\
                                   int prio)                            \
        {                                                               \
                return tracepoint_probe_register_prio(&__tracepoint_##name, \
                                              (void *)probe, data, prio); \
        }                                                               \
        static inline int                                               \
        unregister_trace_##name(void (*probe)(data_proto), void *data)  \
        {                                                               \
                return tracepoint_probe_unregister(&__tracepoint_##name,\
                                                (void *)probe, data);   \
        }                                                               \
        static inline void                                              \
        check_trace_callback_type_##name(void (*cb)(data_proto))        \
        {                                                               \
        }                                                               \
        static inline bool                                              \
        trace_##name##_enabled(void)                                    \ // queries whether the tracepoint is enabled; needed for standalone tracepoints, not when using trace_event
        {                                                               \
                return static_key_false(&__tracepoint_##name.key);      \
        }
#define DEFINE_TRACE_FN(name, reg, unreg)                                \
        static const char __tpstrtab_##name[]                            \
        __attribute__((section("__tracepoints_strings"))) = #name;       \
        struct tracepoint __tracepoint_##name                            \  // the tracepoint structure definition
        __attribute__((section("__tracepoints"))) =                      \
                { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\
        static struct tracepoint * const __tracepoint_ptr_##name __used  \
        __attribute__((section("__tracepoints_ptrs"))) =                 \
                &__tracepoint_##name;

#define DEFINE_TRACE(name)                                              \
        DEFINE_TRACE_FN(name, NULL, NULL);

All of the above is routine; the real magic starts below.

#define TRACE_INCLUDE_PATH .        // defines the header search path; used in define_trace.h
/*
 * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal
 */
#define TRACE_INCLUDE_FILE trace-events-sample          // defines the header file name; used in define_trace.h
#include <trace/define_trace.h>           // include trace/define_trace.h

At its very start, trace/define_trace.h redefines the TRACE_EVENT macro:

#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print)  \
        DEFINE_TRACE(name)

The next part reads:

#ifndef TRACE_INCLUDE_PATH
# define __TRACE_INCLUDE(system) <trace/events/system.h>
# define UNDEF_TRACE_INCLUDE_PATH
#else
# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
#endif

# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system)

/* Let the trace headers be reread */
#define TRACE_HEADER_MULTI_READ

#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)

The gist of this code is to re-include TRACE_INCLUDE_FILE, i.e. trace-events-sample.h, and expand the TRACE_EVENT macro again. This pass of expansion produces the tracepoint definitions.
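The same multiple-inclusion trick can be shown in isolation with a tiny userland sketch of my own (hypothetical file names): a header with no include guard is included repeatedly, and the includer redefines the macro between inclusions, so each pass generates different code from the same event list.

/* events.h -- deliberately has no include guard */
EVENT(foo)
EVENT(bar)

/* user.c -- pass 1: declare a function per event */
#undef EVENT
#define EVENT(name) void trace_##name(void);
#include "events.h"

/* pass 2: define a name string per event */
#undef EVENT
#define EVENT(name) static const char str_##name[] = #name;
#include "events.h"

define_trace.h and trace_events.h play the role of user.c here, redefining TRACE_EVENT before each re-inclusion of trace-events-sample.h.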
define_trace.h then includes trace_events.h, which is the core of trace_event.

#include <trace/trace_events.h>

At the beginning of trace_events.h, TRACE_EVENT is redefined once more.

#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
        DECLARE_EVENT_CLASS(name,                              \
                             PARAMS(proto),                    \
                             PARAMS(args),                     \
                             PARAMS(tstruct),                  \
                             PARAMS(assign),                   \
                             PARAMS(print));                   \
        DEFINE_EVENT(name, name, PARAMS(proto), PARAMS(args));

The remaining code redefines the DECLARE_EVENT_CLASS and DEFINE_EVENT macros several times, re-including trace-events-sample.h each time, for seven expansions of TRACE_EVENT in total. The intermediate expansions are simple, so I won't cover each one.
Let's jump straight to the last expansion.

#undef DECLARE_EVENT_CLASS
#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)  \
_TRACE_PERF_PROTO(call, PARAMS(proto));                                  \
static char print_fmt_##call[] = print;                                  \
static struct trace_event_class __used __refdata event_class_##call = {  \
        .system                 = TRACE_SYSTEM_STRING,                   \
        .define_fields          = trace_event_define_fields_##call,      \
        .fields                 = LIST_HEAD_INIT(event_class_##call.fields),\
        .raw_init               = trace_event_raw_init,                  \
        .probe                  = trace_event_raw_event_##call,          \
        .reg                    = trace_event_reg,                       \
        _TRACE_PERF_INIT(call)                                           \
};

#undef DEFINE_EVENT
#define DEFINE_EVENT(template, call, proto, args)                       \
                                                                        \
static struct trace_event_call __used event_##call = {                  \
        .class                  = &event_class_##template,              \
        {                                                               \
                .tp                     = &__tracepoint_##call,         \
        },                                                              \
        .event.funcs            = &trace_event_type_funcs_##template,   \
        .print_fmt              = print_fmt_##template,                 \
        .flags                  = TRACE_EVENT_FL_TRACEPOINT,            \
};                                                                      \
static struct trace_event_call __used                                   \
__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call

This final expansion defines a trace_event_call structure and arranges, at compile time, for it to be placed in the section("_ftrace_events") section. When the module is loaded, the kernel reads that section, obtains the trace_event_call structures, and registers them with the system. The next post covers how they are registered and enabled.


Analysis of the kprobe mechanism

kprobe is a dynamic debugging mechanism provided by the Linux kernel. It lets users run their own code just before a specified piece of kernel code executes, to gather statistics, trace behavior, and so on.
To understand how the kprobe machinery works, start from samples/kprobes/kprobe_example.c in the kernel source.
That file is built as a kernel module; we begin the analysis with the code that runs when the module is loaded.

#define MAX_SYMBOL_LEN  64
static char symbol[MAX_SYMBOL_LEN] = "_do_fork";        /* by default, run the user's code before _do_fork executes */
module_param_string(symbol, symbol, sizeof(symbol), 0644);

/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
        .symbol_name    = symbol,           /* specifies the code before which the user's code runs */
};

static int __init kprobe_init(void)
{
        int ret;
        kp.pre_handler = handler_pre;           /* runs before the probed code; in the sample it just prints some addresses and symbols, but users can do much more */
        kp.post_handler = handler_post;         /* runs after the probed code; in the sample it also just prints */
        kp.fault_handler = handler_fault;       /* runs if a fault occurs */

        ret = register_kprobe(&kp);             /* register the kp structure */
        if (ret < 0) {
                pr_err("register_kprobe failed, returned %d\n", ret);
                return ret;
        }
        pr_info("Planted kprobe at %p\n", kp.addr);
        return 0;
}
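
The three handlers referenced above are defined in the same sample; simplified versions (the prints here are my paraphrase of the sample's) look roughly like this:

/* runs just before the probed instruction; returning 0 continues normally */
static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("pre_handler: p->addr = %p\n", p->addr);
        return 0;
}

/* runs after the probed instruction has been single-stepped */
static void handler_post(struct kprobe *p, struct pt_regs *regs,
                         unsigned long flags)
{
        pr_info("post_handler: p->addr = %p\n", p->addr);
}

/* called if a fault occurs while a handler or the stepped insn runs */
static int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr)
{
        pr_info("fault_handler: trap %d\n", trapnr);
        return 0;
}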

register_kprobe registers the kprobe structure with the kernel, replaces the instruction at the target kernel address with an int3 instruction, and saves the replaced instruction. When the kernel executes that location, the int3 traps and the user's code runs.
The register_kprobe code:

int register_kprobe(struct kprobe *p)
{
        int ret;
        struct kprobe *old_p;
        struct module *probed_mod;
        kprobe_opcode_t *addr;

        /* Adjust probe address from symbol */
        addr = kprobe_addr(p);                  /* resolve the address from the symbol and offset */
        if (IS_ERR(addr))
                return PTR_ERR(addr);
        p->addr = addr;

        ret = check_kprobe_rereg(p);            /* if this kprobe is already registered, just return without registering again */
        if (ret)
                return ret;

        /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
        p->flags &= KPROBE_FLAG_DISABLED;
        p->nmissed = 0;
        INIT_LIST_HEAD(&p->list);

        ret = check_kprobe_address_safe(p, &probed_mod);    /* check the address can be probed; if not, return */
        if (ret)
                return ret;

        mutex_lock(&kprobe_mutex);

        old_p = get_kprobe(p->addr);            /* has the address already been patched by another kprobe? if so, take the register_aggr_kprobe path */
        if (old_p) {
                /* Since this may unoptimize old_p, locking text_mutex. */
                ret = register_aggr_kprobe(old_p, p);   /* address already probed: just chain the new kprobe behind the old one and run them in order */
                goto out;
        }

        cpus_read_lock();
        /* Prevent text modification */
        mutex_lock(&text_mutex);
        ret = prepare_kprobe(p);                /* copy the original instruction into the kprobe structure */
        mutex_unlock(&text_mutex);
        cpus_read_unlock();
        if (ret)
                goto out;

        INIT_HLIST_NODE(&p->hlist);
        hlist_add_head_rcu(&p->hlist,
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);     /* add the kprobe to the global hash table */

        if (!kprobes_all_disarmed && !kprobe_disabled(p)) {         /* if the kprobe should take effect right away, arm_kprobe patches the kernel text */
                ret = arm_kprobe(p);                        /* replace the instruction at the target kernel address with int3 (0xcc on x86) */
                if (ret) {
                        hlist_del_rcu(&p->hlist);
                        synchronize_sched();
                        goto out;
                }
        }

        /* Try to optimize kprobe */
        try_to_optimize_kprobe(p);                  /* prepares for later kprobes that probe the same address */
out:
        mutex_unlock(&kprobe_mutex);

        if (probed_mod)
                module_put(probed_mod);

        return ret;
}

Once registration is done and the kernel instruction has been replaced, execution reaching that address is caught by int3 and runs the int3 handler.
The int3 handler is the do_int3 function in arch/x86/kernel/traps.c, which contains a call to kprobe_int3_handler(regs).
The key code in that function:

          p = get_kprobe(addr);             /* look up the kprobe for this address */

          if (p) {
                  if (kprobe_running()) {
                          if (reenter_kprobe(p, regs, kcb))
                                  return 1;
                  } else {
                          set_current_kprobe(p, regs, kcb);
                          kcb->kprobe_status = KPROBE_HIT_ACTIVE;

                          /*
                           * If we have no pre-handler or it returned 0, we
                           * continue with normal processing.  If we have a
                           * pre-handler and it returned non-zero, it prepped
                           * for calling the break_handler below on re-entry
                           * for jprobe processing, so get out doing nothing
                           * more here.
                           */
                          if (!p->pre_handler || !p->pre_handler(p, regs))      /* run the struct's pre_handler, i.e. handler_pre in the sample */
                                  setup_singlestep(p, regs, kcb, 0);
                          return 1;
                  }

So the kprobe workflow is: at registration, hang the kprobe structure on a global hash table and replace the instruction to be probed with int3; when execution reaches that instruction, do_int3 runs, and kprobe_int3_handler ultimately executes the user's code.
kprobe itself is a simple feature, but inside the trace framework it becomes enormously useful.


Analysis of the Linux kernel poll mechanism

poll and select are siblings. poll lacks select's many limits, but judging by the code, each poll call moves more data between user space and the kernel than select does, and the event mechanism is essentially unchanged. I suspect poll was the precursor to the later epoll mechanism.
The system-call interface

SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct timespec64 end_time, *to = NULL;
        int ret;

        if (timeout_msecs >= 0) {
                to = &end_time;
                poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));        /* set up the timeout */
        }

        ret = do_sys_poll(ufds, nfds, to);              /* the function that does the work */

        if (ret == -EINTR) {
                struct restart_block *restart_block;

                restart_block = &current->restart_block;
                restart_block->fn = do_restart_poll;
                restart_block->poll.ufds = ufds;
                restart_block->poll.nfds = nfds;

                if (timeout_msecs >= 0) {
                        restart_block->poll.tv_sec = end_time.tv_sec;
                        restart_block->poll.tv_nsec = end_time.tv_nsec;
                        restart_block->poll.has_timeout = 1;
                } else
                        restart_block->poll.has_timeout = 0;

                ret = -ERESTART_RESTARTBLOCK;
        }
        return ret;
}

do_sys_poll is a preprocessing function: it copies the user-space data into the kernel, calls do_poll to do the work, and after do_poll returns copies the finished data back to user space. What's interesting is that it processes the fds in blocks, one page per block; when the final block needs less than a page, only that much memory is allocated. The code is simple enough that a quick read suffices, so I won't list it.
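
The block structure is the poll_list chain; for reference, it looks roughly like this in fs/select.c:

struct poll_list {
        struct poll_list *next;   /* next page-sized block, or NULL */
        int len;                  /* number of pollfd entries in this block */
        struct pollfd entries[0]; /* the entries themselves */
};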

do_poll is the function that does the real work. Since poll is select's sibling, its timeout handling and fd polling are no different from select's; straight to the code.

static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;   /* busy-poll? */
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);     /* timeout handling */

        for (;;) {      /* outer loop; exits just like select: timeout, fd event, or signal */
                struct poll_list *walk;
                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {    /* second-level loop, over the blocks of fds */
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        for (; pfd != pfd_end; pfd++) {     /* per-fd handling */
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                if (do_pollfd(pfd, pt, &can_busy_loop,              /* checks whether this fd has events */
                                              busy_flag)) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                if (!count) {
                        count = wait->error;
                        if (signal_pending(current))
                                count = -EINTR;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) { /* same as select */
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))    /* same as select */
                        timed_out = 1;
        }
        return count;
}

do_pollfd checks a single file descriptor for events. The code:

static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     unsigned int busy_flag)
{
        unsigned int mask;
        int fd;

        mask = 0;
        fd = pollfd->fd;            /* the file descriptor */
        if (fd >= 0) {
                struct fd f = fdget(fd);    /* the file behind the descriptor */
                mask = POLLNVAL;
                if (f.file) {
                        mask = DEFAULT_POLLMASK;
                        if (f.file->f_op->poll) {
                                pwait->_key = pollfd->events|POLLERR|POLLHUP;
                                pwait->_key |= busy_flag;
                                mask = f.file->f_op->poll(f.file, pwait);       /* call the file's poll op to query its state */
                                if (mask & busy_flag)
                                        *can_busy_poll = true;
                        }
                        /* Mask out unneeded events. */
                        mask &= pollfd->events | POLLERR | POLLHUP;         /* strip states that weren't asked for */
                        fdput(f);
                }
        }
        pollfd->revents = mask;             /* store the resulting state */

        return mask;        /* return the resulting state */
}

poll's modest improvement over select is that it has no limit on the number of file descriptors, and it isn't restricted to watching only fds numbered below 1024. But poll isn't perfect. Every call must pass all fds down to the kernel, even if the previous wait saw an event on just one of them; and when results come back, the kernel passes all the states back up, again even if only one fd had an event. With enough fds, poll itself burns a lot of CPU: the kernel polls each fd in turn, and after returning to user space the application must scan them all again, so there are two linear scans per wakeup. Timeliness suffers too: if the fd with the event sits at the end of the list, its message is handled late. Given all these flaws, the open-source community developed the epoll system call, which solves these problems almost perfectly; under high load, most software uses epoll.
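
For contrast, a minimal sketch of the epoll pattern (handle() is a hypothetical consumer): fds are registered once with epoll_ctl, and each epoll_wait returns only the ready ones, so neither the kernel nor the application rescans the whole set.

#include <sys/epoll.h>

extern void handle(int fd);     /* hypothetical event consumer */

void wait_loop(int fd)
{
        int epfd = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };

        epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);   /* register once */
        for (;;) {
                struct epoll_event ready[16];
                int n = epoll_wait(epfd, ready, 16, -1);
                for (int i = 0; i < n; i++)        /* only ready fds come back */
                        handle(ready[i].data.fd);
        }
}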


Analysis of the Linux kernel select mechanism

When I first met select I didn't think the mechanism was anything special. Only after reading the quagga source did I see how powerful it is: a single thread can cleanly handle messages from multiple sources (timer tasks, network messages, IPC messages, and so on). The only requirements are that a single message isn't too slow to process, messages aren't extremely frequent, and latency demands are loose. When a program fits those conditions, select plus a single thread avoids the hassle of thread synchronization while still handling several kinds of messages at once.
Analysis of the system-call interface

SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, /* n: highest fd value across all sets, at most 1024 */
                fd_set __user *, exp, struct timeval __user *, tvp) /* inp: read set, outp: write set, exp: exception set, tvp: timeout */
{
        struct timespec64 end_time, *to = NULL;
        struct timeval tv;
        int ret;

        if (tvp) {
                if (copy_from_user(&tv, tvp, sizeof(tv)))
                        return -EFAULT;

                to = &end_time;
                if (poll_select_set_timeout(to,
                                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
                        return -EINVAL;
        }

        ret = core_sys_select(n, inp, outp, exp, to);       /* do the work; fds that have events are written back into inp, outp, exp */
        ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

        return ret;
}

core_sys_select mainly preprocesses the data in inp, outp, and exp, finally calling do_select to poll the state of the file behind each fd. User space has already encoded the fds as bitmaps; the kernel copies these bitmaps into kernel space and allocates room for select's results. The code in between is simple. One nice touch: a buffer is first preallocated on the stack, and only when it is too small does the code call kvmalloc for fresh space; you can see the kernel chasing performance to the last drop. When do_select finishes, core_sys_select copies its results back into user-space memory, again as bitmaps, with the bit set for every fd that has an event.
Now the do_select analysis. This function is the core of the whole syscall; without further ado, the code:

static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
        ktime_t expire, *to = NULL;
        struct poll_wqueues table;
        poll_table *wait;
        int retval, i, timed_out = 0;
        u64 slack = 0;
        unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;   /* whether to busy-poll; if so, never sleep */
        unsigned long busy_start = 0;

        rcu_read_lock();
        retval = max_select_fd(n, fds);
        rcu_read_unlock();

        if (retval < 0)
                return retval;
        n = retval;

        poll_initwait(&table);
        wait = &table.pt;
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                wait->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);         /* if a timeout was given, compute the slack */

        retval = 0;
        for (;;) {          /* loop until an exit condition: timeout, signal, or fd event */
                unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
                bool can_busy_loop = false;

                inp = fds->in; outp = fds->out; exp = fds->ex;
                rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

                for (i = 0; i < n; ++rinp, ++routp, ++rexp) {       /* second-level loop: walk the bitmaps one unsigned long at a time */
                        unsigned long in, out, ex, all_bits, bit = 1, mask, j;
                        unsigned long res_in = 0, res_out = 0, res_ex = 0;

                        in = *inp++; out = *outp++; ex = *exp++;
                        all_bits = in | out | ex;
                        if (all_bits == 0) {
                                i += BITS_PER_LONG;
                                continue;
                        }

                        for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {       /* third-level loop: bit by bit */
                                struct fd f;
                                if (i >= n)
                                        break;
                                if (!(bit & all_bits))
                                        continue;
                                f = fdget(i);
                                if (f.file) {
                                        const struct file_operations *f_op;
                                        f_op = f.file->f_op;
                                        mask = DEFAULT_POLLMASK;
                                        if (f_op->poll) {
                                                wait_key_set(wait, in, out,
                                                             bit, busy_flag);
                                                mask = (*f_op->poll)(f.file, wait); /* query the file for events; with wait->_qproc NULL this is a pure status query */
                                        }
                                        fdput(f);
                                        if ((mask & POLLIN_SET) && (in & bit)) {    /* event present: set the result bit */
                                                res_in |= bit;
                                                retval++;
                                                wait->_qproc = NULL;
                                        }
                                        if ((mask & POLLOUT_SET) && (out & bit)) {  /* event present: set the result bit */
                                                res_out |= bit;
                                                retval++;
                                                wait->_qproc = NULL;
                                        }
                                        if ((mask & POLLEX_SET) && (ex & bit)) {    /* event present: set the result bit */
                                                res_ex |= bit;
                                                retval++;
                                                wait->_qproc = NULL;
                                        }
                                        /* got something, stop busy polling */
                                        if (retval) {
                                                can_busy_loop = false;
                                                busy_flag = 0;

                                        /*
                                         * only remember a returned
                                         * POLL_BUSY_LOOP if we asked for it
                                         */
                                        } else if (busy_flag & mask)
                                                can_busy_loop = true;

                                }
                        }
                        if (res_in)
                                *rinp = res_in;     /* store the result bitmap */
                        if (res_out)
                                *routp = res_out;   /* store the result bitmap */
                        if (res_ex)
                                *rexp = res_ex;     /* store the result bitmap */
                        cond_resched();
                }
                wait->_qproc = NULL;
                if (retval || timed_out || signal_pending(current))
                        break;
                if (table.error) {
                        retval = table.error;
                        break;
                }

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {     /* when busy-waiting, decide whether to yield: if we've busy-waited too long, give other processes the CPU */
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                                           to, slack))  /* when not busy-waiting, yield the CPU here; a timer interrupt can wake us; on expiry set timed_out and poll one more time */
                        timed_out = 1;
        }

        poll_freewait(&table);
        return retval;
}

This function fills in the res_* fields of the fds structure, marking the fds found to have events.
core_sys_select finally copies the result into user-space memory for the application to use.

Note: select limits the number of file descriptors to 1024, and the largest fd number to 1024. An fd numbered above 1024 cannot be selected at all: even if you watch only one fd, if its number is 1025, select will not monitor it.
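
In user space that limit shows up as FD_SETSIZE; a quick sketch:

#include <stdio.h>
#include <sys/select.h>

int main(void)
{
        fd_set rfds;

        FD_ZERO(&rfds);
        printf("FD_SETSIZE = %d\n", FD_SETSIZE); /* 1024 on Linux/glibc */
        /* FD_SET(1025, &rfds) would index past the bitmap: undefined behavior */
        FD_SET(0, &rfds);                        /* watch stdin */
        select(1, &rfds, NULL, NULL, NULL);      /* blocks until stdin is readable */
        return 0;
}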


linux eventfd in detail

I first heard of eventfd around the virtio vhost driver, assumed it was some profound technology, and didn't look closely.
Recently, while studying select, poll, and epoll, I ran into it again and decided to read the eventfd module's code properly.
Code path: fs/eventfd.c
System calls: eventfd, eventfd2
Usage: the program first calls one of the eventfd system calls to create an eventfd file descriptor, then uses read and write on that descriptor for inter-process communication. Since the eventfd file operations include a poll function, the descriptor can also be waited on with select, poll, or epoll. The logic is simple; the code analysis follows.
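
A minimal user-space sketch of that usage (my own illustration): a child signals the parent through one eventfd. With EFD_SEMAPHORE set at creation, each read would consume 1 instead of the whole count.

#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        int efd = eventfd(0, 0);           /* initial count 0, blocking */
        uint64_t v = 3;

        if (fork() == 0) {                 /* child: signal the parent */
                write(efd, &v, sizeof(v)); /* adds 3 to the counter, wakes readers */
                _exit(0);
        }
        read(efd, &v, sizeof(v));          /* blocks until count > 0, then resets it */
        printf("got %llu\n", (unsigned long long)v); /* prints 3 */
        return 0;
}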
Key data structure

struct eventfd_ctx {
        struct kref kref;           /* object reference count */
        wait_queue_head_t wqh;      /* head of the wait queue */
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The kernel
         * side eventfd_signal() also, adds to the "count" counter and
         * issue a wakeup.
         */
        __u64 count;                /* the counter that read/write operate on */
        unsigned int flags;         /* flags describing this eventfd file */
};

Creating the eventfd file descriptor

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        int fd, error;
        struct file *file;

        error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);    /* get a free descriptor */
        if (error < 0)
                return error;
        fd = error;

        file = eventfd_file_create(count, flags);           /* create the eventfd file object */
        if (IS_ERR(file)) {            
                error = PTR_ERR(file);
                goto err_put_unused_fd;
        }
        fd_install(fd, file);                           /* bind the descriptor to the eventfd file object */

        return fd;                              

err_put_unused_fd:
        put_unused_fd(fd);

        return error;
}

struct file *eventfd_file_create(unsigned int count, int flags)
{
        struct file *file;
        struct eventfd_ctx *ctx;

        /* Check the EFD_* constants for consistency.  */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~EFD_FLAGS_SET)
                return ERR_PTR(-EINVAL); 

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);        /* allocate the eventfd file's private data */
        if (!ctx)
                return ERR_PTR(-ENOMEM);

        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        ctx->flags = flags;

        file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
                                  O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); /* create an anonymous-inode file: a file whose mount point is anonymous, existing only in memory, never on disk; a virtual filesystem */
        if (IS_ERR(file))
                eventfd_free_ctx(ctx);

        return file;
}

The eventfd_fops table:

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = eventfd_show_fdinfo,
#endif
        .release        = eventfd_release,
        .poll           = eventfd_poll,         /* poll op, used by select/epoll */
        .read           = eventfd_read,         /* read op */
        .write          = eventfd_write,        /* write op */
        .llseek         = noop_llseek,
};

At this point the virtual eventfd file has been created and its file descriptor returned to the user program.

Now for the read and write operations on this file, starting with write.

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);               /* set up the wait-queue entry */

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))   /* copy in the value to add */
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);                  /* spinlock with interrupts disabled */
        res = -EAGAIN;
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {       /* non-blocking? then skip this wait-for-writable code */
                __add_wait_queue(&ctx->wqh, &wait);     /* add wait to the wait queue */
                for (res = 0;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();                     /* switch to another process; the current one sleeps */
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);  /* done waiting, wake up */
                __set_current_state(TASK_RUNNING);      /* mark the current process runnable */
        }
        if (likely(res > 0)) {
                ctx->count += ucnt;                     /* write the data */
                if (waitqueue_active(&ctx->wqh))        /* any waiters on the queue? */
                        wake_up_locked_poll(&ctx->wqh, POLLIN);     /* wake processes waiting to read */
        }
        spin_unlock_irq(&ctx->wqh.lock);                /* release the spinlock */

        return res;             /* return the number of bytes written */
}

wake_up_locked_poll ultimately runs __wake_up_common:

static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
                        int nr_exclusive, int wake_flags, void *key,
                        wait_queue_entry_t *bookmark)
{
        wait_queue_entry_t *curr, *next;
        int cnt = 0;

        if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
                curr = list_next_entry(bookmark, entry);

                list_del(&bookmark->entry);
                bookmark->flags = 0;
        } else
                curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);

        if (&curr->entry == &wq_head->head)
                return nr_exclusive;

        list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
                unsigned flags = curr->flags;
                int ret;

                if (flags & WQ_FLAG_BOOKMARK)
                        continue;

                ret = curr->func(curr, mode, wake_flags, key);  /* run the wakeup callback; it differs between a blocked process and epoll: either wake the process, or just put the fd on the ready list */
                if (ret < 0)
                        break;
                if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)  /* bounds how many processes are woken, avoiding the thundering-herd problem */
                        break;

                if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
                                (&next->entry != &wq_head->head)) {
                        bookmark->flags = WQ_FLAG_BOOKMARK;
                        list_add_tail(&bookmark->entry, &next->entry);
                        break;
                }
        }
        return nr_exclusive;
}

The eventfd read function

static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 cnt;

        if (count < sizeof(cnt))
                return -EINVAL;
        res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);/* the key function: reads the count value */
        if (res < 0)
                return res;

        return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
}

ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
{       
        ssize_t res;
        DECLARE_WAITQUEUE(wait, current);

        spin_lock_irq(&ctx->wqh.lock);
        *cnt = 0;
        res = -EAGAIN; 
        if (ctx->count > 0)
                res = 0;
        else if (!no_wait) {            /* blocking mode: run the blocking path */
                __add_wait_queue(&ctx->wqh, &wait);             /* add to the wait queue */
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
                                res = 0;
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res == 0)) {
                eventfd_ctx_do_read(ctx, cnt);      /* read ctx->count; how much is read is governed by the flags */
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, POLLOUT);    /* as in write: wake waiting processes */
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}
static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{       
        *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;   /* depending on the flag, read one unit at a time or everything */
        ctx->count -= *cnt;
}

The eventfd poll function.
So far I've found the poll function invoked from two places:
in do_select in fs/select.c, which walks the fd list and calls each file's poll to check whether events occurred, returning if so;
and in fs/eventpoll.c, when ep_ctl performs an ep_insert, to register on the wait queue, so that when an event wakes the queue a function in fs/eventpoll.c runs and places the fd on the ready list.

static unsigned int eventfd_poll(struct file *file, poll_table *wait)
{       
        struct eventfd_ctx *ctx = file->private_data;
        unsigned int events = 0;
        u64 count;

        poll_wait(file, &ctx->wqh, wait);           /* useful for epoll: at ep_insert this hooks the poll_table onto the wait queue */

/* the code below serves select: select walks each fd and calls its poll to query for events */
        count = READ_ONCE(ctx->count);

        if (count > 0) 
                events |= POLLIN;
        if (count == ULLONG_MAX)
                events |= POLLERR;
        if (ULLONG_MAX - 1 > count)
                events |= POLLOUT;

        return events;
}

eventfd amounts to an inter-process communication mechanism whose payload is a single integer; it is really more of an event-notification mechanism, which is why virtio's vhost uses it for packet notification.


vxlan virtual NIC

Linux has two vxlan implementations: one in the kernel proper and one in the OVS kernel datapath; the OVS datapath implementation is a second wrapping built on top of the kernel one.
The vxlan protocol encapsulates Ethernet frames inside an outer UDP header.
When sending and receiving, vxlan has to solve the following problems.
1. On transmit, how to learn the MAC address of the inner destination IP for a given VNI?
Reading the kernel code, the kernel doesn't solve this; it merely keeps a cache recording (vni, inner-ip, mac, outer-ip) tuples.
2. The outer UDP is point-to-point unicast: how to derive the outer UDP destination IP from the inner destination IP?
Again, the kernel doesn't solve this either; it keeps the same (vni, inner-ip, mac, outer-ip) cache.

With the information above, the vxlan NIC encapsulates outgoing packets accordingly and sends them out through the UDP transmit path.
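
In practice that cache is the vxlan forwarding database (FDB), populated by learning or filled in by hand with iproute2; roughly like this (device names and addresses are examples):

# create a vxlan device with VNI 42 riding on eth0
ip link add vxlan42 type vxlan id 42 dstport 4789 dev eth0 local 10.0.0.1

# map an inner MAC to the outer IP of the remote VTEP
bridge fdb append 52:54:00:12:34:56 dev vxlan42 dst 10.0.0.2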

3. On receive, how is a packet handed to the right vxlan NIC?
When a vxlan NIC is brought up, it creates a kernel UDP socket according to its configuration and marks that socket as a tunnel socket. When the socket receives a packet, it calls the vxlan module's receive function; the whole path runs in interrupt context, up to queuing the packet on the vxlan device's receive queue.
4. How to tell which vxlan NIC a packet belongs to?
After reception, the packet's VNI is used to look up the corresponding vxlan NIC.

I won't paste the code; it's fairly simple. The key is knowing the data flow.


The ipvtap NIC device

Like macvtap, an ipvtap device combines an ipvlan device with a tap character device.
ipvtap initialization is exactly the same as macvtap's; the only difference is that where macvtap creates a macvlan device, ipvtap creates an ipvlan device.
Key point:
judging from the code, an ipvtap device cannot be created from an IPVLAN_MODE_L3S ipvlan device, because an L3S device does not take the packet over in netif_receive_skb; when the protocol stack later rewrites dev, netif_receive_skb is not re-run, and tracing the code shows the skb can never be delivered up to user space.
I won't analyze the code in detail; it follows the same ideas as macvtap and I don't want to repeat that work.


ipvlan virtual NIC

An ipvlan virtual NIC is similar to a macvlan one. The difference: every macvlan NIC has its own MAC address, while all ipvlan NICs share a single MAC address and packets are assigned to a NIC by IP address. It is a bit like the kernel's old alias interfaces, but the big difference is that because real virtual net devices exist, each can be moved into a different namespace, and, as we'll see later, an ipvlan device can be combined with a tap device to form an ipvtap NIC.
Like macvlan, ipvlan devices have three modes (a creation example follows the list):
L2 mode: the ipvlan NIC receives broadcast frames and answers ARP requests itself.
L3 mode: the ipvlan NIC receives no layer-2 broadcasts; ARP requests are handled by the master NIC on its behalf.
L3S mode: as in L3, no layer-2 broadcasts and ARP is handled by the master. The only difference from L3 is that L3S rewrites the receiving device only for packets destined to the local host, leaving everything else untouched; this mode cannot be combined with a tap device.
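
A quick sketch of creating one with iproute2 (names are examples):

# create an ipvlan device in L2 mode on top of eth0 and move it into a namespace
ip link add link eth0 ipvl0 type ipvlan mode l2
ip netns add ns1
ip link set ipvl0 netns ns1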

The key ipvlan data structure is ipvl_port: every parasitized physical device gets one, and all the ipvlan devices riding on it are linked into this structure.

struct ipvl_port {
        struct net_device       *dev;   /* the underlying physical NIC */
        possible_net_t          pnet;   /* network namespace */
        struct hlist_head       hlhead[IPVLAN_HASH_SIZE];   /* hash table: address -> ipvlan device */
        struct list_head        ipvlans;    /* all ipvlan devices on this port */
        u16                     mode;
        u16                     dev_id_start;
        struct work_struct      wq;     /* work item that delivers broadcast traffic */
        struct sk_buff_head     backlog;    /* queue of buffered broadcast packets */
        int                     count;  /* number of ipvlan devices riding on this port */
        struct ida              ida;
};

Each ipvlan device's private data:

struct ipvl_dev {
        struct net_device       *dev;   /* this ipvlan net device */
        struct list_head        pnode;  /* node on the port's ipvlans list */
        struct ipvl_port        *port;  /* the port this device belongs to */
        struct net_device       *phy_dev;   /* the underlying physical NIC */
        struct list_head        addrs;  /* list of IP addresses on this ipvlan device */
        struct ipvl_pcpu_stats  __percpu *pcpu_stats;   /* per-CPU statistics */
        DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
        netdev_features_t       sfeatures;
        u32                     msg_enable;
};

Module registration

static int __init ipvlan_init_module(void)
{
        int err;

        ipvlan_init_secret();
        register_netdevice_notifier(&ipvlan_notifier_block);    /* register for netdev up/down events */
        register_inet6addr_notifier(&ipvlan_addr6_notifier_block);  /* IPv6 address add/remove events */
        register_inet6addr_validator_notifier(
            &ipvlan_addr6_vtor_notifier_block); /* IPv6 address validation events */
        register_inetaddr_notifier(&ipvlan_addr4_notifier_block);   /* IPv4 address add/remove events */
        register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block);    /* IPv4 address validation events */

        err = register_pernet_subsys(&ipvlan_net_ops);  /* register the netns init/exit ops */
        if (err < 0)
                goto error;

        err = ipvlan_link_register(&ipvlan_link_ops);   /* register the netlink ops for creating ipvlan links */
        if (err < 0) {
                unregister_pernet_subsys(&ipvlan_net_ops);
                goto error;
        }

        return 0;
error:
        unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
        unregister_inetaddr_validator_notifier(
            &ipvlan_addr4_vtor_notifier_block);
        unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
        unregister_inet6addr_validator_notifier(
            &ipvlan_addr6_vtor_notifier_block);
        unregister_netdevice_notifier(&ipvlan_notifier_block);
        return err;
}

Because ipvlan splits traffic by IP address, the demux hash table must be updated whenever a virtual NIC gains or loses an address; hence the address-change notifiers registered above.

int ipvlan_link_new(struct net *src_net, struct net_device *dev,
                    struct nlattr *tb[], struct nlattr *data[],
                    struct netlink_ext_ack *extack)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port;
        struct net_device *phy_dev;
        int err;
        u16 mode = IPVLAN_MODE_L3;
        bool create = false;

        if (!tb[IFLA_LINK])
                return -EINVAL;

        phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));  /* look up the underlying physical device */
        if (!phy_dev)
                return -ENODEV;

        if (netif_is_ipvlan(phy_dev)) { /* if nested, dig down to the bottom physical device */
                struct ipvl_dev *tmp = netdev_priv(phy_dev);

                phy_dev = tmp->phy_dev;
        } else if (!netif_is_ipvlan_port(phy_dev)) {/* first ipvlan on this physical device: create the port */
                err = ipvlan_port_create(phy_dev);
                if (err < 0)
                        return err;
                create = true;
        }

        if (data && data[IFLA_IPVLAN_MODE])
                mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); /* mode requested for the new ipvlan device */

        port = ipvlan_port_get_rtnl(phy_dev);
        ipvlan->phy_dev = phy_dev;
        ipvlan->dev = dev;
        ipvlan->port = port;
        ipvlan->sfeatures = IPVLAN_FEATURES;
        ipvlan_adjust_mtu(ipvlan, phy_dev);
        INIT_LIST_HEAD(&ipvlan->addrs);

        /* If the port-id base is at the MAX value, then wrap it around and
         * begin from 0x1 again. This may be due to a busy system where lots
         * of slaves are getting created and deleted.
         */
        if (port->dev_id_start == 0xFFFE)
                port->dev_id_start = 0x1;

        /* Since L2 address is shared among all IPvlan slaves including
         * master, use unique 16 bit dev-ids to diffentiate among them.
         * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
         * slave link [see addrconf_ifid_eui48()].
         */
        err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE,
                             GFP_KERNEL); 
        if (err < 0)
                err = ida_simple_get(&port->ida, 0x1, port->dev_id_start,
                                     GFP_KERNEL);
        if (err < 0)
                goto destroy_ipvlan_port;
        dev->dev_id = err;
        /* Increment id-base to the next slot for the future assignment */
        port->dev_id_start = err + 1;

        /* TODO Probably put random address here to be presented to the
         * world but keep using the physical-dev address for the outgoing
         * packets.
         */
        memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN); /* MAC address is inherited from the physical device */

        dev->priv_flags |= IFF_IPVLAN_SLAVE;

        err = register_netdevice(dev);
        if (err < 0)
                goto remove_ida;

        err = netdev_upper_dev_link(phy_dev, dev);
        if (err) {
                goto unregister_netdev;
        }
        err = ipvlan_set_port_mode(port, mode); /* set the mode; every ipvlan device riding on one physical device must use the same mode */
        if (err) {
                goto unlink_netdev;
        }

        list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);/* add this ipvlan to the port */
        netif_stacked_transfer_operstate(phy_dev, dev);
        return 0;

unlink_netdev:
        netdev_upper_dev_unlink(phy_dev, dev);
unregister_netdev:
        unregister_netdevice(dev);
remove_ida:
        ida_simple_remove(&port->ida, dev->dev_id);
destroy_ipvlan_port:
        if (create)
                ipvlan_port_destroy(phy_dev);
        return err;
}

The device open function

static int ipvlan_open(struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct net_device *phy_dev = ipvlan->phy_dev;
        struct ipvl_addr *addr;

        if (ipvlan->port->mode == IPVLAN_MODE_L3 ||
            ipvlan->port->mode == IPVLAN_MODE_L3S)
                dev->flags |= IFF_NOARP;
        else
                dev->flags &= ~IFF_NOARP;   /* only L2 mode answers ARP itself */

        list_for_each_entry(addr, &ipvlan->addrs, anode)
                ipvlan_ht_addr_add(ipvlan, addr);   /* add this device's IP addresses to the lookup hash table */

        return dev_uc_add(phy_dev, phy_dev->dev_addr);
}

Data path: receiving packets

rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
{       
        struct sk_buff *skb = *pskb;
        struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev);

        if (!port)
                return RX_HANDLER_PASS;

        switch (port->mode) {
        case IPVLAN_MODE_L2:
                return ipvlan_handle_mode_l2(pskb, port);   /* L2-mode receive */
        case IPVLAN_MODE_L3:
                return ipvlan_handle_mode_l3(pskb, port);   /* L3-mode receive */
        case IPVLAN_MODE_L3S:
                return RX_HANDLER_PASS;     /* L3S: dev is not changed while traversing the stack, only at the final LOCAL_IN hook */
        }

        /* Should not reach here */
        WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
                          port->mode);
        kfree_skb(skb);
        return RX_HANDLER_CONSUMED;
} 
static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
                                                 struct ipvl_port *port)
{               
        struct sk_buff *skb = *pskb;
        struct ethhdr *eth = eth_hdr(skb);
        rx_handler_result_t ret = RX_HANDLER_PASS;
        void *lyr3h;
        int addr_type;

        if (is_multicast_ether_addr(eth->h_dest)) {     /* broadcast/multicast handling */
                if (ipvlan_external_frame(skb, port)) {
                        struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

                        /* External frames are queued for device local
                         * distribution, but a copy is given to master
                         * straight away to avoid sending duplicates later
                         * when work-queue processes this frame. This is
                         * achieved by returning RX_HANDLER_PASS.
                         */
                        if (nskb) {
                                ipvlan_skb_crossing_ns(nskb, NULL);
                                ipvlan_multicast_enqueue(port, nskb, false);/* queue on the broadcast backlog for the multicast work queue to process */
                        }
                }
        } else {
                struct ipvl_addr *addr;

                lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
                if (!lyr3h)
                        return ret;

                addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);    /* find the receiving device by address in the IP hash table */
                if (addr)
                        ret = ipvlan_rcv_frame(addr, pskb, false);  /* rewrite the packet's receiving device to the ipvlan device found; returns RX_HANDLER_ANOTHER */
        }

        return ret;
}
static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb,
                                                 struct ipvl_port *port)
{       
        void *lyr3h;
        int addr_type;
        struct ipvl_addr *addr;
        struct sk_buff *skb = *pskb;
        rx_handler_result_t ret = RX_HANDLER_PASS;

        lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
        if (!lyr3h)
                goto out;

        addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);    /* find the receiving device by address */
        if (addr)
                ret = ipvlan_rcv_frame(addr, pskb, false);  /* change the packet's receiving device, then return RX_HANDLER_ANOTHER */

out:    
        return ret;
}
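
Both receive handlers converge on ipvlan_rcv_frame(). A heavily condensed sketch of what that call boils down to on the receive path (the real function also handles the transmit-side local case, statistics and network-namespace crossing):

static rx_handler_result_t ipvlan_rcv_frame_sketch(struct ipvl_addr *addr,
                                                   struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;

        /* re-home the packet onto the ipvlan slave that owns the
         * destination address ... */
        skb->dev = addr->master->dev;
        skb->pkt_type = PACKET_HOST;
        /* ... and ask netif_receive_skb() to run another round of
         * protocol processing with the new dev */
        return RX_HANDLER_ANOTHER;
}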

Data path: packet transmission

static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        int skblen = skb->len;
        int ret;

        ret = ipvlan_queue_xmit(skb, dev);  // transmit the packet
        if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
                struct ipvl_pcpu_stats *pcptr;

                pcptr = this_cpu_ptr(ipvlan->pcpu_stats);

                u64_stats_update_begin(&pcptr->syncp);
                pcptr->tx_pkts++;
                pcptr->tx_bytes += skblen;
                u64_stats_update_end(&pcptr->syncp);
        } else {
                this_cpu_inc(ipvlan->pcpu_stats->tx_drps);
        }
        return ret;
}
int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{                                    
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev);

        if (!port)
                goto out;

        if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
                goto out;

        switch(port->mode) {
        case IPVLAN_MODE_L2:
                return ipvlan_xmit_mode_l2(skb, dev);   // L2-mode transmit path
        case IPVLAN_MODE_L3:
        case IPVLAN_MODE_L3S:
                return ipvlan_xmit_mode_l3(skb, dev);   // L3/L3S-mode transmit path
        } 

        /* Should not reach here */
        WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n",
                          port->mode);
out:
        kfree_skb(skb);
        return NET_XMIT_DROP;                
}
static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ethhdr *eth = eth_hdr(skb);
        struct ipvl_addr *addr;
        void *lyr3h;
        int addr_type;

        if (ether_addr_equal(eth->h_dest, eth->h_source)) { // frame addressed to ourselves (all slaves share the master's MAC)
                lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
                if (lyr3h) {
                        addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
                        if (addr)
                                return ipvlan_rcv_frame(addr, &skb, true);  // deliver directly through the matching ipvlan device
                }
                skb = skb_share_check(skb, GFP_ATOMIC);
                if (!skb)
                        return NET_XMIT_DROP;

                /* Packet definitely does not belong to any of the
                 * virtual devices, but the dest is local. So forward
                 * the skb for the main-dev. At the RX side we just return
                 * RX_PASS for it to be processed further on the stack.
                 */
                return dev_forward_skb(ipvlan->phy_dev, skb);   // hand it up through the physical device

        } else if (is_multicast_ether_addr(eth->h_dest)) {  // multicast/broadcast frame
                ipvlan_skb_crossing_ns(skb, NULL);
                ipvlan_multicast_enqueue(ipvlan->port, skb, true);  // queue on the multicast backlog
                return NET_XMIT_SUCCESS;
        }

        ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
        return dev_queue_xmit(skb); // transmit through the physical device
}
static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
{       
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        void *lyr3h;
        struct ipvl_addr *addr;
        int addr_type;

        lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
        if (!lyr3h)
                goto out;

        addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
        if (addr)
                return ipvlan_rcv_frame(addr, &skb, true);  // destination is another local ipvlan slave: deliver directly

out:    
        ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
        return ipvlan_process_outbound(skb);    // otherwise do a route lookup and send the packet out
}
static int ipvlan_process_outbound(struct sk_buff *skb)
{
        struct ethhdr *ethh = eth_hdr(skb);
        int ret = NET_XMIT_DROP;

        /* In this mode we dont care about multicast and broadcast traffic */
        if (is_multicast_ether_addr(ethh->h_dest)) {    // multicast/broadcast is dropped outright in this mode
                pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n",
                                    ntohs(skb->protocol));
                kfree_skb(skb);
                goto out;
        }

        /* The ipvlan is a pseudo-L2 device, so the packets that we receive
         * will have L2; which need to discarded and processed further
         * in the net-ns of the main-device.
         */
        if (skb_mac_header_was_set(skb)) {
                skb_pull(skb, sizeof(*ethh));
                skb->mac_header = (typeof(skb->mac_header))~0U;
                skb_reset_network_header(skb);
        }

        if (skb->protocol == htons(ETH_P_IPV6))
                ret = ipvlan_process_v6_outbound(skb);  // IPv6 outbound path
        else if (skb->protocol == htons(ETH_P_IP))
                ret = ipvlan_process_v4_outbound(skb);  // IPv4 outbound path
        else {
                pr_warn_ratelimited("Dropped outbound packet type=%x\n",
                                    ntohs(skb->protocol));
                kfree_skb(skb);
        }
out:
        return ret;
}
static int ipvlan_process_v4_outbound(struct sk_buff *skb)
{
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net_device *dev = skb->dev;
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int err, ret = NET_XMIT_DROP;
        struct flowi4 fl4 = {
                .flowi4_oif = dev->ifindex,
                .flowi4_tos = RT_TOS(ip4h->tos),
                .flowi4_flags = FLOWI_FLAG_ANYSRC,
                .daddr = ip4h->daddr,
                .saddr = ip4h->saddr,
        };

        rt = ip_route_output_flow(net, &fl4, NULL); // route lookup
        if (IS_ERR(rt))
                goto err;

        if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
                ip_rt_put(rt);
                goto err;
        }
        skb_dst_set(skb, &rt->dst);
        err = ip_local_out(net, skb->sk, skb);  // send via ip_local_out(), i.e. through the routing output path
        if (unlikely(net_xmit_eval(err)))
                dev->stats.tx_errors++;
        else
                ret = NET_XMIT_SUCCESS;
        goto out;
err:
        dev->stats.tx_errors++;
        kfree_skb(skb);
out:
        return ret;
}

An ipvlan device can be seen as an extension of macvlan. Note that in L3 mode a transmitted packet does not necessarily leave through the ipvlan master device; after the route lookup it may go out through any other interface.


The macvtap device

  • Differences between macvtap and macvlan devices
    A macvtap device is built on top of a macvlan device: creating a macvtap device always creates an underlying macvlan device.
    The biggest difference between the two lies in where packets come from and where they go.
    A macvtap device is associated with a character device: packets received through the underlying macvlan device are dropped straight into the character device's buffer, where they wait to be read; packets written to the character device are transmitted directly through the associated macvlan device.
    So frames received by a macvtap device go straight up to user space, and user-space programs send frames through the character device associated with the macvtap device (a minimal user-space sketch follows this list).
    Frames received by a plain macvlan device are handed to the protocol stack, and the stack in turn transmits through the macvlan device's transmit hook.
  • Differences between macvtap and tun/tap devices
    macvtap pushes frames received by its macvlan NIC up to user space through the character device; writing to the character device transmits the data out through the associated macvlan device.
    With tun/tap, packets the protocol stack sends through the tun/tap virtual NIC are pushed up to user space through the character device; writing to the character device injects packets back into the protocol stack through the virtual NIC.
    In short: tun/tap hands packets transmitted by its virtual NIC to user space, while macvtap hands packets received by its virtual macvlan NIC to user space; tun/tap injects user-space packets into the protocol stack, while macvtap transmits user-space packets out through the virtual macvlan device.
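
To make the character-device side concrete, here is a minimal user-space sketch. It assumes an existing macvtap interface named macvtap0 (a made-up name); the /dev/tapN node is named after the interface index, matching the "tap%d" naming in macvtap_device_event() further below, and each read() is prefixed with a struct virtio_net_hdr because the queue defaults to IFF_VNET_HDR (see tap_open() below):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <net/if.h>
#include <linux/virtio_net.h>

int main(void)
{
        unsigned char buf[65536];
        char path[32];
        unsigned int ifindex = if_nametoindex("macvtap0"); /* hypothetical name */
        ssize_t n;
        int fd;

        if (!ifindex)
                return 1;
        snprintf(path, sizeof(path), "/dev/tap%u", ifindex);
        fd = open(path, O_RDWR);        /* ends up in tap_open() */
        if (fd < 0)
                return 1;

        /* one read() == one virtio_net_hdr + one ethernet frame */
        n = read(fd, buf, sizeof(buf));
        if (n > (ssize_t)sizeof(struct virtio_net_hdr))
                printf("frame of %zd bytes\n",
                       n - (ssize_t)sizeof(struct virtio_net_hdr));
        close(fd);
        return 0;
}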

Private data structure: this struct becomes the private data (netdev_priv()) of the macvlan NIC that gets created.

struct macvtap_dev {
        struct macvlan_dev vlan; // macvlan private data, discussed in the previous section
        struct tap_dev    tap;   // macvtap-specific private data
};
struct tap_dev {
        struct net_device       *dev;
        u16                     flags;
        /* This array tracks active taps. */
        struct tap_queue    __rcu *taps[MAX_TAP_QUEUES];    // per-file receive queues
        /* This list tracks all taps (both enabled and disabled) */
        struct list_head        queue_list;
        int                     numvtaps;
        int                     numqueues;
        netdev_features_t       tap_features;
        int                     minor;

        void (*update_features)(struct tap_dev *tap, netdev_features_t features);
        void (*count_tx_dropped)(struct tap_dev *tap);
        void (*count_rx_dropped)(struct tap_dev *tap);
};
struct tap_queue {
        struct sock sk;
        struct socket sock;
        struct socket_wq wq;
        int vnet_hdr_sz;
        struct tap_dev __rcu *tap;
        struct file *file;
        unsigned int flags;
        u16 queue_index;
        bool enabled;
        struct list_head next;
        struct skb_array skb_array; // packet ring buffer
};
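
The taps[] array is what makes macvtap multiqueue: every open() of the character device attaches one more tap_queue, and received flows are spread across the queues by hash. Roughly (a condensed sketch, not the exact kernel function, which also honours a recorded rx queue):

/* caller holds rcu_read_lock(), as on the normal receive path */
static struct tap_queue *tap_pick_queue(struct tap_dev *tap,
                                        struct sk_buff *skb)
{
        int numvtaps = READ_ONCE(tap->numvtaps);

        if (!numvtaps)
                return NULL;
        /* the same flow hash always lands on the same queue */
        return rcu_dereference(tap->taps[skb_get_hash(skb) % numvtaps]);
}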

Module registration

static int macvtap_init(void)
{
        int err;

        err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap"); // register the char-device major/minor region used by macvtap
        if (err)
                goto out1;

        err = class_register(&macvtap_class); // register the macvtap class in sysfs
        if (err)
                goto out2;

        err = register_netdevice_notifier(&macvtap_notifier_block); // register a callback for netdevice notifier events
        if (err)
                goto out3;

        err = macvlan_link_register(&macvtap_link_ops); // register macvtap's rtnl_link_ops so that "ip link" can create macvtap devices
        if (err)
                goto out4;

        return 0;

out4:
        unregister_netdevice_notifier(&macvtap_notifier_block);
out3:
        class_unregister(&macvtap_class);
out2:
        tap_destroy_cdev(macvtap_major, &macvtap_cdev);
out1:
        return err;
}

macvtap device creation

static void macvtap_setup(struct net_device *dev)
{
        macvlan_common_setup(dev); // reuse the macvlan netdev setup
        dev->tx_queue_len = TUN_READQ_SIZE;
}
static int macvtap_newlink(struct net *src_net, struct net_device *dev,
                           struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        struct macvtap_dev *vlantap = netdev_priv(dev);
        int err;

        INIT_LIST_HEAD(&vlantap->tap.queue_list);

        /* Since macvlan supports all offloads by default, make
         * tap support all offloads also.
         */
        vlantap->tap.tap_features = TUN_OFFLOADS;

        /* Register callbacks for rx/tx drops accounting and updating
         * net_device features
         */
        vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
        vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
        vlantap->tap.update_features  = macvtap_update_features;

        err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); // install the macvtap rx_handler
        if (err)
                return err;

        /* Don't put anything that may fail after macvlan_common_newlink
         * because we can't undo what it does.
         */
        err = macvlan_common_newlink(src_net, dev, tb, data); // let macvlan register the new netdevice
        if (err) {
                netdev_rx_handler_unregister(dev);
                return err;
        }

        vlantap->tap.dev = vlantap->vlan.dev;

        return 0;
}

Registering the netdevice raises a NETDEV_REGISTER notifier event, which ultimately calls back into macvtap_device_event().

static int macvtap_device_event(struct notifier_block *unused,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct macvtap_dev *vlantap;
        struct device *classdev;
        dev_t devt;
        int err;
        char tap_name[IFNAMSIZ];

        if (dev->rtnl_link_ops != &macvtap_link_ops) // ignore events for devices that are not macvtap
                return NOTIFY_DONE;

        snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
        vlantap = netdev_priv(dev);

        switch (event) {
        case NETDEV_REGISTER:
                /* Create the device node here after the network device has
                 * been registered but before register_netdevice has
                 * finished running.
                 */
                err = tap_get_minor(macvtap_major, &vlantap->tap); // allocate the minor for the char device about to be created and tie it to this netdevice
                if (err)
                        return notifier_from_errno(err);

                devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
                classdev = device_create(&macvtap_class, &dev->dev, devt,
                                         dev, tap_name); // create the character device node for this macvtap
                if (IS_ERR(classdev)) {
                        tap_free_minor(macvtap_major, &vlantap->tap);
                        return notifier_from_errno(PTR_ERR(classdev));
                }
                err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
                                        tap_name);
                if (err)
                        return notifier_from_errno(err);
                break;
        case NETDEV_UNREGISTER:
                /* vlan->minor == 0 if NETDEV_REGISTER above failed */
                if (vlantap->tap.minor == 0)
                        break;
                sysfs_remove_link(&dev->dev.kobj, tap_name);
                devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
                device_destroy(&macvtap_class, devt);
                tap_free_minor(macvtap_major, &vlantap->tap);
                break;
        case NETDEV_CHANGE_TX_QUEUE_LEN:
                if (tap_queue_resize(&vlantap->tap))
                        return NOTIFY_BAD;
                break;
        }

        return NOTIFY_DONE;
}
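
The device_create()/sysfs_create_link() pair above is also how user space finds the node: /sys/class/net/<ifname>/tap<ifindex>/dev exposes the char device's major:minor. A small user-space sketch (hypothetical helper; the interface is assumed to exist):

#include <stdio.h>
#include <net/if.h>

/* returns 0 and fills *major/*minor on success, -1 on failure */
int tap_major_minor(const char *ifname, int *major, int *minor)
{
        char path[128];
        unsigned int ifindex = if_nametoindex(ifname);
        FILE *f;
        int ok;

        if (!ifindex)
                return -1;
        snprintf(path, sizeof(path), "/sys/class/net/%s/tap%u/dev",
                 ifname, ifindex);
        f = fopen(path, "r");
        if (!f)
                return -1;
        ok = (fscanf(f, "%d:%d", major, minor) == 2);
        fclose(f);
        return ok ? 0 : -1;
}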

Opening the character device for user-space communication

static int tap_open(struct inode *inode, struct file *file)
{
        struct net *net = current->nsproxy->net_ns;
        struct tap_dev *tap;
        struct tap_queue *q;
        int err = -ENODEV;

        rtnl_lock();
        tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); // look up the tap_dev by the char device's major/minor
        if (!tap)
                goto err;

        err = -ENOMEM;
        q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                             &tap_proto, 0); // allocate a tap_queue; each open() of the char device adds one more queue
        if (!q)
                goto err;

        RCU_INIT_POINTER(q->sock.wq, &q->wq);
        init_waitqueue_head(&q->wq.wait);
        q->sock.type = SOCK_RAW;
        q->sock.state = SS_CONNECTED;
        q->sock.file = file;
        q->sock.ops = &tap_socket_ops;
        sock_init_data(&q->sock, &q->sk);
        q->sk.sk_write_space = tap_sock_write_space;
        q->sk.sk_destruct = tap_sock_destruct;
        q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
        q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

        /*
         * so far only KVM virtio_net uses tap, enable zero copy between
         * guest kernel and host kernel when lower device supports zerocopy
         *
         * The macvlan supports zerocopy iff the lower device supports zero
         * copy so we don't have to look at the lower device directly.
         */
        if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
                sock_set_flag(&q->sk, SOCK_ZEROCOPY);

        err = -ENOMEM;
        if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
                goto err_array;

        err = tap_set_queue(tap, file, q); // attach the queue to the tap device
        if (err)
                goto err_queue;

        dev_put(tap->dev);

        rtnl_unlock();
        return err;

err_queue:
        skb_array_cleanup(&q->skb_array);
err_array:
        sock_put(&q->sk);
err:
        if (tap)
                dev_put(tap->dev);

        rtnl_unlock();
        return err;
}

At this point the data path is fully open.
Receive path:
tap_handle_frame() is invoked from netif_receive_skb() through the rx_handler registered above.

rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct net_device *dev = skb->dev;
        struct tap_dev *tap;
        struct tap_queue *q;
        netdev_features_t features = TAP_FEATURES;

        tap = tap_dev_get_rcu(dev); // get the tap_dev associated with this device
        if (!tap)
                return RX_HANDLER_PASS;

        q = tap_get_queue(tap, skb); // pick a receive queue by flow hash
        if (!q)
                return RX_HANDLER_PASS;

        if (__skb_array_full(&q->skb_array))
                goto drop;

        skb_push(skb, ETH_HLEN);

        /* Apply the forward feature mask so that we perform segmentation
         * according to users wishes.  This only works if VNET_HDR is
         * enabled.
         */
        if (q->flags & IFF_VNET_HDR)
                features |= tap->tap_features;
        if (netif_needs_gso(skb, features)) {
                struct sk_buff *segs = __skb_gso_segment(skb, features, false);

                if (IS_ERR(segs))
                        goto drop;

                if (!segs) {
                        if (skb_array_produce(&q->skb_array, skb)) // push the skb into the ring buffer
                                goto drop;
                        goto wake_up;
                }

                consume_skb(skb);
                while (segs) {
                        struct sk_buff *nskb = segs->next;

                        segs->next = NULL;
                        if (skb_array_produce(&q->skb_array, segs)) {
                                kfree_skb(segs);
                                kfree_skb_list(nskb);
                                break;
                        }
                        segs = nskb;
                }
        } else {
                /* If we receive a partial checksum and the tap side
                 * doesn't support checksum offload, compute the checksum.
                 * Note: it doesn't matter which checksum feature to
                 *        check, we either support them all or none.
                 */
                if (skb->ip_summed == CHECKSUM_PARTIAL &&
                    !(features & NETIF_F_CSUM_MASK) &&
                    skb_checksum_help(skb))
                        goto drop;
                if (skb_array_produce(&q->skb_array, skb)) // push the skb into the ring buffer
                        goto drop;
        }

wake_up:
        wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND); // wake up readers blocked on this queue
        return RX_HANDLER_CONSUMED;

drop:
        /* Count errors/drops only here, thus don't care about args. */
        if (tap->count_rx_dropped)
                tap->count_rx_dropped(tap);
        kfree_skb(skb);
        return RX_HANDLER_CONSUMED;
}

A user-space read() ends up in tap_read_iter(). tap_do_read() works much like its tun/tap counterpart: if the ring buffer is empty it blocks (unless O_NONBLOCK was set), otherwise it returns the buffered data.

static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct tap_queue *q = file->private_data;
        ssize_t len = iov_iter_count(to), ret;

        ret = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL); // fetch data from the ring buffer
        ret = min_t(ssize_t, ret, len);
        if (ret > 0)
                iocb->ki_pos = ret;
        return ret;
}

Transmit path

static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct tap_queue *q = file->private_data;

        return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK); // hand the data to the transmit path
}
static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
                            struct iov_iter *from, int noblock)
{
        int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
        struct sk_buff *skb;
        struct tap_dev *tap;
        unsigned long total_len = iov_iter_count(from);
        unsigned long len = total_len;
        int err;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int vnet_hdr_len = 0;
        int copylen = 0;
        int depth;
        bool zerocopy = false;
        size_t linear;

        if (q->flags & IFF_VNET_HDR) {
                vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto err;
                len -= vnet_hdr_len;

                err = -EFAULT;
                if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
                        goto err;
                iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                     tap16_to_cpu(q, vnet_hdr.csum_start) +
                     tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
                             tap16_to_cpu(q, vnet_hdr.hdr_len))
                        vnet_hdr.hdr_len = cpu_to_tap16(q,
                                 tap16_to_cpu(q, vnet_hdr.csum_start) +
                                 tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
                err = -EINVAL;
                if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
                        goto err;
        }

        err = -EINVAL;
        if (unlikely(len < ETH_HLEN))
                goto err;

        if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
                struct iov_iter i;

                copylen = vnet_hdr.hdr_len ?
                        tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
                if (copylen > good_linear)
                        copylen = good_linear;
                else if (copylen < ETH_HLEN)
                        copylen = ETH_HLEN;
                linear = copylen;
                i = *from;
                iov_iter_advance(&i, copylen);
                if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
                        zerocopy = true;
        }

        if (!zerocopy) {
                copylen = len;
                linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
                if (linear > good_linear)
                        linear = good_linear;
                else if (linear < ETH_HLEN)
                        linear = ETH_HLEN;
        }

        skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
                            linear, noblock, &err); // allocate the skb
        if (!skb)
                goto err;

        if (zerocopy)
                err = zerocopy_sg_from_iter(skb, from);
        else
                err = skb_copy_datagram_from_iter(skb, 0, from, len); // copy the payload from user space

        if (err)
                goto err_kfree;

        skb_set_network_header(skb, ETH_HLEN);
        skb_reset_mac_header(skb); // mark the current data pointer as the mac header
        skb->protocol = eth_hdr(skb)->h_proto;

        if (vnet_hdr_len) {
                err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
                                            tap_is_little_endian(q));
                if (err)
                        goto err_kfree;
        }

        skb_probe_transport_header(skb, ETH_HLEN);

        /* Move network header to the right position for VLAN tagged packets */
        if ((skb->protocol == htons(ETH_P_8021Q) ||
             skb->protocol == htons(ETH_P_8021AD)) &&
            __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
                skb_set_network_header(skb, depth);

        rcu_read_lock();
        tap = rcu_dereference(q->tap);
        /* copy skb_ubuf_info for callback when skb has no error */
        if (zerocopy) {
                skb_shinfo(skb)->destructor_arg = m->msg_control;
                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
        } else if (m && m->msg_control) {
                struct ubuf_info *uarg = m->msg_control;
                uarg->callback(uarg, false);
        }

        if (tap) {
                skb->dev = tap->dev; // set the outgoing device
                dev_queue_xmit(skb); // transmit; this ends up in the macvlan xmit hook
        } else {
                kfree_skb(skb);
        }
        rcu_read_unlock();

        return total_len;

err_kfree:
        kfree_skb(skb);

err:
        rcu_read_lock();
        tap = rcu_dereference(q->tap);
        if (tap && tap->count_tx_dropped)
                tap->count_tx_dropped(tap);
        rcu_read_unlock();

        return err;
}
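
For completeness, the user-space side of this transmit path: a hedged sketch that injects one ethernet frame through an already-open macvtap fd (see the read sketch earlier). Because the queue defaults to IFF_VNET_HDR, the write must be prefixed with a virtio_net_hdr; an all-zero header requests no offloads:

#include <string.h>
#include <unistd.h>
#include <linux/virtio_net.h>

/* hypothetical helper: tap_fd was opened from /dev/tapN as shown above */
ssize_t send_frame(int tap_fd, const void *eth_frame, size_t frame_len)
{
        unsigned char buf[sizeof(struct virtio_net_hdr) + 1514];
        struct virtio_net_hdr vnet;

        memset(&vnet, 0, sizeof(vnet)); /* no checksum/GSO offload */
        if (frame_len > sizeof(buf) - sizeof(vnet))
                return -1;
        memcpy(buf, &vnet, sizeof(vnet));
        memcpy(buf + sizeof(vnet), eth_frame, frame_len);
        /* lands in tap_write_iter() -> tap_get_user() -> dev_queue_xmit() */
        return write(tap_fd, buf, sizeof(vnet) + frame_len);
}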

This completes the macvtap transmit path.

macvtap is widely used in VPN and virtualization setups. Its main job is to deliver packets received by the macvlan device directly to user space instead of to the kernel protocol stack.