分类
未分类

kprobe机制分析

kprobe是linux内核提供的一种动态调试机制,通过这套机制,用户可以在执行指定代码前,先执行自己的代码。做一些统计,跟踪等工作。
想了解kprobe模块的工作机理,可以从内核源码的samples/kprobes/kprobe_example.c文件开始。
这个文件会被编译成内核的一个模块,我们从模块加载时执行的代码开始分析。

/* Size of the buffer that holds the name of the symbol to probe. */
#define MAX_SYMBOL_LEN  64
static char symbol[MAX_SYMBOL_LEN] = "_do_fork";        // symbol to probe; by default the user's code runs before _do_fork executes
module_param_string(symbol, symbol, sizeof(symbol), 0644);

/* For each probe you need to allocate a kprobe structure */
static struct kprobe kp = {
        .symbol_name    = symbol,           // names the code location before which the user's code is run
};

/*
 * Module init: install the three example callbacks on the file-level probe
 * and register it.
 *
 * Returns 0 once the probe is planted, otherwise the negative value that
 * register_kprobe() reported.
 */
static int __init kprobe_init(void)
{
        int err;

        /*
         * Callbacks run before the probed code, after it, and when an error
         * occurs. In this example they only print addresses and symbols.
         */
        kp.pre_handler = handler_pre;
        kp.post_handler = handler_post;
        kp.fault_handler = handler_fault;

        err = register_kprobe(&kp);
        if (err >= 0) {
                pr_info("Planted kprobe at %p\n", kp.addr);
                return 0;
        }

        pr_err("register_kprobe failed, returned %d\n", err);
        return err;
}

register_kprobe函数注册kprobe结构到内核中,并替换内核指定位置的指令为int 3 指令,保存被替换的指令。当内核执行这个位置时,将被int 3 捕获,然后执行用户代码。
register_kprobe代码如下

/*
 * Register kprobe @p with the kernel.
 *
 * Resolves the probe address, rejects re-registration and unsafe addresses,
 * then, under kprobe_mutex, either chains @p behind an existing probe at the
 * same address or prepares @p, inserts it into kprobe_table and arms it.
 *
 * Returns the result of the last step executed; a non-zero result from any
 * of the early checks is returned immediately.
 */
int register_kprobe(struct kprobe *p)
{
        int ret;
        struct kprobe *old_p;
        struct module *probed_mod;
        kprobe_opcode_t *addr;

        /* Adjust probe address from symbol */
        addr = kprobe_addr(p);                  // resolve the address from the symbol and offset
        if (IS_ERR(addr))
                return PTR_ERR(addr);
        p->addr = addr;

        ret = check_kprobe_rereg(p);            // check whether p is already registered; if so, return without registering again
        if (ret)
                return ret;

        /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
        p->flags &= KPROBE_FLAG_DISABLED;
        p->nmissed = 0;
        INIT_LIST_HEAD(&p->list);

        ret = check_kprobe_address_safe(p, &probed_mod);    // check whether the address may be probed; return if it may not
        if (ret)
                return ret;

        mutex_lock(&kprobe_mutex);

        old_p = get_kprobe(p->addr);            // is there already a kprobe at this address? if so, take the register_aggr_kprobe path
        if (old_p) {
                /* Since this may unoptimize old_p, locking text_mutex. */
                ret = register_aggr_kprobe(old_p, p);   // address already probed: chain the new kprobe behind the old one so they run in order
                goto out;
        }

        cpus_read_lock();
        /* Prevent text modification */
        mutex_lock(&text_mutex);
        ret = prepare_kprobe(p);                // copy the current instruction into the kprobe structure
        mutex_unlock(&text_mutex);
        cpus_read_unlock();
        if (ret)
                goto out;

        INIT_HLIST_NODE(&p->hlist);
        hlist_add_head_rcu(&p->hlist,
                       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);     // add the kprobe to the global kprobe_table hash

        if (!kprobes_all_disarmed && !kprobe_disabled(p)) {         // arm immediately unless kprobes are globally disarmed or p is disabled
                ret = arm_kprobe(p);                        // author's note: replaces the instruction at the address with int 3 (0xcc on x86)
                if (ret) {
                        /* Arming failed: take p back out of the hash table. */
                        hlist_del_rcu(&p->hlist);
                        synchronize_sched();
                        goto out;
                }
        }

        /* Try to optimize kprobe */
        try_to_optimize_kprobe(p);                  // NOTE(review): author describes this as preparation for later kprobes on the same address -- confirm against try_to_optimize_kprobe()
out:
        mutex_unlock(&kprobe_mutex);

        /* Drop the module reference, if check_kprobe_address_safe() handed one back. */
        if (probed_mod)
                module_put(probed_mod);

        return ret;
}

注册完成且指定地址的指令被替换之后,内核运行到被替换的代码时会被int3 捕获,转去执行int3的处理代码。
int3的处理代码是arch/x86/kernel/traps.c文件中的do_int3函数,其中有一句 kprobe_int3_handler(regs)。
kprobe_int3_handler函数中关键代码如下

          /* Excerpt: look up the probe for the trapping address and run its pre-handler. */
          p = get_kprobe(addr);             // find the kprobe structure for this address

          if (p) {
                  if (kprobe_running()) {
                          if (reenter_kprobe(p, regs, kcb))
                                  return 1;
                  } else {
                          set_current_kprobe(p, regs, kcb);
                          kcb->kprobe_status = KPROBE_HIT_ACTIVE;

                          /*
                           * If we have no pre-handler or it returned 0, we
                           * continue with normal processing.  If we have a
                           * pre-handler and it returned non-zero, it prepped
                           * for calling the break_handler below on re-entry
                           * for jprobe processing, so get out doing nothing
                           * more here.
                           */
                          if (!p->pre_handler || !p->pre_handler(p, regs))      // run the structure's pre_handler, i.e. handler_pre in the example
                                  setup_singlestep(p, regs, kcb, 0);
                          return 1;
                  }

由此可以看出kprobe的工作流程如下:注册时,将kprobe结构挂到全局哈希表kprobe_table中,然后将想要kprobe的指令替换成int 3 指令;当执行到这个指令时,会进入do_int3函数,最终调用kprobe_int3_handler执行用户代码。
kprobe本身功能简单,但是在trace 框架下,它能发挥巨大的作用。

分类
未分类

linux 内核poll机制分析

poll和select算是姐妹篇吧。poll没有 select那么多限制。但是就代码上看,poll每次用户态和内核态传递的数据比select更多,事件机制没有显著变化。我猜poll是后来epoll机制的先导。
系统调用接口

/*
 * poll system call entry point.
 *
 * Converts the millisecond timeout into an end time, hands the work to
 * do_sys_poll(), and, when that returns -EINTR, fills in the current task's
 * restart block so the call can be restarted through do_restart_poll.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct timespec64 end_time, *to = NULL;
        struct restart_block *rb;
        int ret;

        /* With a negative timeout, to stays NULL and no end time is passed down. */
        if (timeout_msecs >= 0) {
                to = &end_time;
                poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        /* The real work happens here. */
        ret = do_sys_poll(ufds, nfds, to);
        if (ret != -EINTR)
                return ret;

        /* Interrupted: record what is needed to restart the call. */
        rb = &current->restart_block;
        rb->fn = do_restart_poll;
        rb->poll.ufds = ufds;
        rb->poll.nfds = nfds;

        if (timeout_msecs >= 0) {
                rb->poll.tv_sec = end_time.tv_sec;
                rb->poll.tv_nsec = end_time.tv_nsec;
        }
        rb->poll.has_timeout = (timeout_msecs >= 0) ? 1 : 0;

        return -ERESTART_RESTARTBLOCK;
}

do_sys_poll函数,这个函数也是预处理函数,将用户态的数据拷贝到内核,然后调用do_poll函数工作,等do_poll返回以后再将处理完的数据拷贝到用户态。比较有意思的是,它采用的是分块处理,每个walk块占一个page;当最后一块不足一个page时,只分配实际需要的那么长的内存。代码同样非常简单,稍微看看就懂,不列出了。

do_poll函数,这个函数,是真正干活的函数,由于是select的姐妹篇,所以在超时设置,句柄查询方式方面跟select没有区别不废话,直接上代码。

/*
 * Poll every descriptor in @list until at least one reports an event, an
 * error or pending signal is seen, or the timeout expires.
 *
 * @list:     chain of pollfd blocks to scan
 * @wait:     poll wait-queue state; its poll_table is passed to each file
 * @end_time: end time, or NULL when no timeout was given
 *
 * Returns the number of descriptors with events, wait->error, -EINTR when a
 * signal is pending, or 0 on timeout.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;   // whether to busy-poll
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);     // timeout handling

        for (;;) {      // outer loop; exits the same way select does: timeout, an fd has an event, or a signal is pending
                struct poll_list *walk;
                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {    // second-level loop: one block of descriptors at a time
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        for (; pfd != pfd_end; pfd++) {     // handle a single descriptor
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                if (do_pollfd(pfd, pt, &can_busy_loop,              // query whether this descriptor has events
                                              busy_flag)) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                if (!count) {
                        count = wait->error;
                        if (signal_pending(current))
                                count = -EINTR;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) { // same as in select
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))    // same as in select
                        timed_out = 1;
        }
        return count;
}

do_pollfd函数。查询单个文件句柄是否有事件,上代码。

/*
 * Query one pollfd entry for events.
 *
 * A negative fd yields 0. An fd with no backing file yields POLLNVAL.
 * Otherwise the result is DEFAULT_POLLMASK or the file's poll callback
 * result, restricted to the requested events plus POLLERR and POLLHUP.
 * The result is stored in pollfd->revents and also returned.
 *
 * *can_busy_poll is set to true when the callback result contains busy_flag.
 */
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     unsigned int busy_flag)
{
        unsigned int revents = 0;
        int fd = pollfd->fd;
        int wanted;
        struct fd f;

        if (fd < 0)
                goto store;

        /* Look up the file behind the descriptor. */
        f = fdget(fd);
        if (!f.file) {
                revents = POLLNVAL;
                goto store;
        }

        wanted = pollfd->events | POLLERR | POLLHUP;

        revents = DEFAULT_POLLMASK;
        if (f.file->f_op->poll) {
                pwait->_key = wanted;
                pwait->_key |= busy_flag;
                /* Ask the file's own poll callback for its state. */
                revents = f.file->f_op->poll(f.file, pwait);
                if (revents & busy_flag)
                        *can_busy_poll = true;
        }

        /* Mask out unneeded events. */
        revents &= wanted;
        fdput(f);

store:
        pollfd->revents = revents;
        return revents;
}

poll函数比select函数稍微改进是,poll函数没有文件句柄数量的限制,且poll函数也不会只能监听奇葩的句柄号是1024以下的文件。但是poll并不完美,每次调用poll函数都需要将所有的文件句柄下传到内核态。即使这次监听只有一个文件有事件,下次监听时也需要将所有文件句柄都下传,而且不只是下传哦,当有结果的时候内核还要上传的哦,即使只有一个文件有事件,也需要把所有状态都上传。当句柄足够多的时候,poll函数本身非常消耗cpu。因为它在内核时需要挨个轮询,当返回用户态以后还需要挨个轮询,两遍轮询存在性能瓶颈。时效性也有问题,如果出现事件的句柄在下传句柄的最后,则有可能消息处理不及时。鉴于poll函数有如此多的缺陷。开源社区的老大们开发了epoll系统调用。epoll几乎完美的解决了上述问题。因此在高负载情况下,大多数都是用epoll系统调用。

分类
未分类

linux 内核select机制分析

刚开始接触select函数的时候,没有觉得 这个机制有多厉害,自从上次看了quagga源代码以后,第一次感觉到select 这么厉害,居然能够用单线程非常好的处理多源头消息(定时任务,网络消息,进程通讯消息等)。当然,唯一的要求就是 单次处理的消息时间不能太长,消息不能非常频繁,消息对时效性要求不高。当程序满足以上条件时,即可使用select加单线程来处理。既减少了线程同步的麻烦又能同时处理多种类型的消息。
系统调用接口分析

/*
 * select system call entry point.
 *
 * @n:    descriptor count bound (author's note: the largest handle value,
 *        at most 1024)
 * @inp:  user bitmap of descriptors watched for reading
 * @outp: user bitmap of descriptors watched for writing
 * @exp:  user bitmap of descriptors watched for exceptional conditions
 * @tvp:  user timeout, or NULL for none
 *
 * Returns -EFAULT if the timeout cannot be copied in, -EINVAL if
 * poll_select_set_timeout() rejects it, otherwise whatever
 * poll_select_copy_remaining() returns for the core_sys_select() result.
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct timeval __user *, tvp)
{
        struct timespec64 end_time, *to = NULL;
        struct timeval tv;
        int ret;

        if (tvp) {
                if (copy_from_user(&tv, tvp, sizeof(tv)) != 0)
                        return -EFAULT;

                /* Fold whole seconds out of tv_usec; pass the rest as nanoseconds. */
                if (poll_select_set_timeout(&end_time,
                                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC) != 0)
                        return -EINVAL;

                to = &end_time;
        }

        /* Author's note: descriptors that have events are written back into inp, outp and exp. */
        ret = core_sys_select(n, inp, outp, exp, to);

        return poll_select_copy_remaining(&end_time, tvp, 1, ret);
}

core_sys_select函数,主要是对inp,outp,exp中的数据进行预处理,最后调用do_select函数,去轮询文件句柄对应的文件的状态。在用户态中,已经将文件句柄转换成bitmap。在内核中需要将这些bitmap拷贝到内核空间,并为select的结果分配空间。这中间的代码很简单。稍微有点意思的是,它在栈空间首先预分配了一块空间,只有当预分配的空间不够时,才会调用kvmalloc重新分配新的空间。可以看出内核在追求性能方面,一直在追求极致。最后当do_select函数执行完成,core_sys_select函数将do_select函数执行的结果拷贝到用户态的内存中,返回给用户,同样使用bitmap表示,有消息的句柄对应的bit被置位。
do_select函数分析,这个函数是,整个系统调用的核心,废话不多说,直接上代码

/*
 * Core of select: scan descriptors 0..n-1 named in @fds until at least one
 * has a requested event, the timeout expires, a signal is pending, or the
 * wait table reports an error.
 *
 * @n:        descriptor bound; replaced by the result of max_select_fd()
 * @fds:      input bitmaps (in/out/ex) and result bitmaps (res_in/res_out/res_ex)
 * @end_time: end time, or NULL when no timeout was given
 *
 * Returns the number of result bits set, a negative value from
 * max_select_fd(), or table.error.
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
        ktime_t expire, *to = NULL;
        struct poll_wqueues table;
        poll_table *wait;
        int retval, i, timed_out = 0;
        u64 slack = 0;
        unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;   // whether to busy-poll; while busy-polling the task does not sleep
        unsigned long busy_start = 0;

        rcu_read_lock();
        retval = max_select_fd(n, fds);
        rcu_read_unlock();

        if (retval < 0)
                return retval;
        n = retval;

        poll_initwait(&table);
        wait = &table.pt;
        /* A zero end time means "do not wait": scan once without registering waiters. */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                wait->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);         // a timeout was given: estimate its slack

        retval = 0;
        for (;;) {          // loop until an exit condition: timeout, signal, or a descriptor has an event
                unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
                bool can_busy_loop = false;

                inp = fds->in; outp = fds->out; exp = fds->ex;
                rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

                for (i = 0; i < n; ++rinp, ++routp, ++rexp) {       // second-level loop: one unsigned long word of the bitmaps per iteration
                        unsigned long in, out, ex, all_bits, bit = 1, mask, j;
                        unsigned long res_in = 0, res_out = 0, res_ex = 0;

                        in = *inp++; out = *outp++; ex = *exp++;
                        all_bits = in | out | ex;
                        /* Nothing requested in this word: skip all of its descriptors. */
                        if (all_bits == 0) {
                                i += BITS_PER_LONG;
                                continue;
                        }

                        for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {       // third-level loop: one bit (descriptor) at a time
                                struct fd f;
                                if (i >= n)
                                        break;
                                if (!(bit & all_bits))
                                        continue;
                                f = fdget(i);
                                if (f.file) {
                                        const struct file_operations *f_op;
                                        f_op = f.file->f_op;
                                        mask = DEFAULT_POLLMASK;
                                        if (f_op->poll) {
                                                wait_key_set(wait, in, out,
                                                             bit, busy_flag);
                                                mask = (*f_op->poll)(f.file, wait); // query the file for events; author's note: with wait->_qproc NULL this only queries state
                                        }
                                        fdput(f);
                                        if ((mask & POLLIN_SET) && (in & bit)) {    // event present: set the bit in the read result
                                                res_in |= bit;
                                                retval++;
                                                wait->_qproc = NULL;
                                        }
                                        if ((mask & POLLOUT_SET) && (out & bit)) {  // event present: set the bit in the write result
                                                res_out |= bit;
                                                retval++;
                                                wait->_qproc = NULL;
                                        }
                                        if ((mask & POLLEX_SET) && (ex & bit)) {    // event present: set the bit in the exception result
                                                res_ex |= bit;
                                                retval++;
                                                wait->_qproc = NULL;
                                        }
                                        /* got something, stop busy polling */
                                        if (retval) {
                                                can_busy_loop = false;
                                                busy_flag = 0;

                                        /*
                                         * only remember a returned
                                         * POLL_BUSY_LOOP if we asked for it
                                         */
                                        } else if (busy_flag & mask)
                                                can_busy_loop = true;

                                }
                        }
                        if (res_in)
                                *rinp = res_in;     // store this word of the read result bitmap
                        if (res_out)
                                *routp = res_out;   // store this word of the write result bitmap
                        if (res_ex)
                                *rexp = res_ex;     // store this word of the exception result bitmap
                        cond_resched();
                }
                wait->_qproc = NULL;
                if (retval || timed_out || signal_pending(current))
                        break;
                if (table.error) {
                        retval = table.error;
                        break;
                }

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {     // busy-polling: keep spinning unless a reschedule is needed or the busy-loop time is used up
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                                           to, slack))  // not busy-polling: give up the CPU here; a zero return is treated as timeout, so timed_out is set and one more scan is done
                        timed_out = 1;
        }

        poll_freewait(&table);
        return retval;
}

该函数填充fds数据结构中的res_* 变量,表示查询到已经有事件的文件句柄
core_sys_select函数最终将结果拷贝到用户态内存空间,供用户程序使用。

注意:select 受 fd_set 位图大小(FD_SETSIZE,通常为1024)的限制,只能监控句柄号小于1024(即0~1023)的文件,句柄号达到或超过1024就不能被select。即使进程只打开了一个句柄,只要它的句柄号是1025,select也不会监控它。

分类
未分类

vxlan 虚拟网卡

在linux中vxlan有两种实现,一种是 linux 内核实现 一种是 ovs kernel datapath中的实现, ovs kernel datapath的实现是基于 linux 内核实现,二次封装。
vxlan 协议是外层用udp封装以太数据包。
vxlan在发送和接收数据的时候需要解决以下几个问题
1.发送时,如何知道特定vni 的内层数据包的目的ip的mac地址
看内核代码发现,内核不处理这个问题,内核只是保存一个缓存区,记录vni inner-ip mac out-ip四元组
2.外层udp是点对点单播,如何通过内层数据包的目的ip获取外层udp的目的ip
看内核代码发现,内核也不处理这个问题,内核只是保存一个缓存区,记录 vni inner-ip mac out-ip四元组

在解决上面说的问题后,vxlan网卡在发包时。根据上面的信息封装数据包,并通过udp发包函数将数据包发送出去。

3.收包时,如何将数据包传递给对应的vxlan网卡。
在打开vxlan网卡时,vxlan网卡会根据配置,在内核创建udp sock,然后将udp sock的类型设置为隧道类型。当对应的udp sock 收到数据包时,会调用 vxlan模块中对应的接收函数,整个过程都在中断中进行,直至将数据包挂在vxlan设备的收包队列中。
4.如何区分是哪个vxlan网卡。
接收到数据包以后,根据内层数据包的vni 获取对应的vxlan网卡

代码 就不贴了, 感觉代码挺简单的。 关键是知道数据流程

分类
未分类

ipvtap 网卡设备

跟macvtap设备类似 ipvtap设备 是将ipvlan设备和tap字符设备结合一起。
ipvtap设备的初始化方法跟macvtap一模一样,只是在创建网卡设备时,macvtap创建macvlan设备,ipvtap创建ipvlan设备
划重点:
在ipvtap设备中,看代码应该不能创建 IPVLAN_MODE_L3S类型的ipvlan设备,因为IPVLAN_MODE_L3S类型的设备不会在netif_receive_skb函数中收包,而是在后面的协议栈中才修改dev,不会重新执行netif_receive_skb函数,跟踪代码发现无法上送skb到用户态。
代码就不做详细分析了,跟macvtap 思路一致,不想做重复性的工作

分类
未分类

ipvlan 虚拟 网卡

ipvlan虚拟网卡类似于macvlan虚拟网卡。只是macvlan虚拟网卡 每个虚拟网卡都有自己的mac地址,而 ipvlan虚拟网卡所有的网卡共用一个mac地址,通过不同的ip地址来区分数据包属于哪个网卡。有点类似于内核之前的别名接口。但是它跟别名接口最大的区别在于,由于存在虚拟网卡设备,可以将不同的虚拟网卡分配到不同的命名空间,后续我们可以看到它还能够跟tap设备结合,组成 ipvtap虚拟网卡。
类似于 macvlan设备,ipvlan设备也有3种模式
L2模式,在这种模式下,ipvlan虚拟网卡能够收到广播报文,能够自己处理arp请求
L3模式,在这种模式下,ipvlan虚拟网卡不能够接收二层广播报文,arp请求由主网卡代为处理
L3S模式,在这种模式下,ipvlan虚拟网卡不能够接收二层广播报文,arp请求由主网卡代为处理。跟L3模式唯一的区别在于,L3S模式只有在报文是发往本地的时候才修改接收数据包的网卡,否则不修改,这种模式没法和tap设备结合

ipvlan设备的关键数据结构是ipvl_port结构,每个被寄生的物理设备都会有一个这样的结构,所有寄生设备都会被连接到这个结构中

/*
 * Per-lower-device ipvlan state: one of these exists for each physical
 * device that hosts ipvlan devices, and every hosted device hangs off it.
 */
struct ipvl_port {
        struct net_device       *dev;   // the physical (lower) network device being hosted on
        possible_net_t          pnet;   // network namespace
        struct hlist_head       hlhead[IPVLAN_HASH_SIZE];   // hash used to find an ipvlan device by address
        struct list_head        ipvlans;    // all ipvlan devices on this port
        u16                     mode;
        u16                     dev_id_start;
        struct work_struct      wq;     // work item that sends out broadcast-address frames
        struct sk_buff_head     backlog;    // queue of buffered broadcast frames
        int                     count;  // number of ipvlan devices hosted on this port
        struct ida              ida;
};

每个ipvlan设备私有数据结构

/* Private data of a single ipvlan device. */
struct ipvl_dev {
        struct net_device       *dev;   // the ipvlan network device itself
        struct list_head        pnode;  // list node
        struct ipvl_port        *port;  // the port this device belongs to
        struct net_device       *phy_dev;   // the physical (lower) device this one is hosted on
        struct list_head        addrs;  // IP addresses assigned to this ipvlan device
        struct ipvl_pcpu_stats  __percpu *pcpu_stats;   // per-CPU statistics counters
        DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
        netdev_features_t       sfeatures;
        u32                     msg_enable;
};

模块注册

/*
 * Module init: register the notifier blocks, the per-netns operations and
 * the link operations used by ipvlan.
 *
 * Returns 0 on success. If register_pernet_subsys() or
 * ipvlan_link_register() fails, everything registered so far is undone and
 * the negative error is returned.
 */
static int __init ipvlan_init_module(void)
{
        int err;

        ipvlan_init_secret();

        /* Device events, then IPv6 and IPv4 address change/validation events. */
        register_netdevice_notifier(&ipvlan_notifier_block);
        register_inet6addr_notifier(&ipvlan_addr6_notifier_block);
        register_inet6addr_validator_notifier(
            &ipvlan_addr6_vtor_notifier_block);
        register_inetaddr_notifier(&ipvlan_addr4_notifier_block);
        register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block);

        /* Network-namespace init/exit operations. */
        err = register_pernet_subsys(&ipvlan_net_ops);
        if (err < 0)
                goto drop_notifiers;

        /* Link operations used to create ipvlan devices. */
        err = ipvlan_link_register(&ipvlan_link_ops);
        if (err < 0)
                goto drop_pernet;

        return 0;

drop_pernet:
        unregister_pernet_subsys(&ipvlan_net_ops);
drop_notifiers:
        unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
        unregister_inetaddr_validator_notifier(
            &ipvlan_addr4_vtor_notifier_block);
        unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
        unregister_inet6addr_validator_notifier(
            &ipvlan_addr6_vtor_notifier_block);
        unregister_netdevice_notifier(&ipvlan_notifier_block);
        return err;
}

由于ipvlan需要根据ip地址分流,因此当虚拟网卡添加或者删除ip时,都需要在分流hash表中做相应的操作。因此需要注册 ip地址变化事件处理函数

/*
 * Create a new ipvlan device @dev on top of the lower device named by
 * tb[IFLA_LINK].
 *
 * Creates the port on the lower device if it does not have one yet, picks a
 * dev_id from the port's IDA, copies the lower device's MAC address,
 * registers and links the device, applies the requested mode and finally
 * adds the device to the port's list.
 *
 * Returns 0 on success, -EINVAL without IFLA_LINK, -ENODEV if the lower
 * device is not found, or the error of the failing step after unwinding.
 */
int ipvlan_link_new(struct net *src_net, struct net_device *dev,
                    struct nlattr *tb[], struct nlattr *data[],
                    struct netlink_ext_ack *extack)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port;
        struct net_device *phy_dev;
        int err;
        u16 mode = IPVLAN_MODE_L3;
        bool create = false;

        if (!tb[IFLA_LINK])
                return -EINVAL;

        phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));  // look up the lower (physical) device
        if (!phy_dev)
                return -ENODEV;

        if (netif_is_ipvlan(phy_dev)) { // stacked on an ipvlan device: use that device's underlying physical device
                struct ipvl_dev *tmp = netdev_priv(phy_dev);

                phy_dev = tmp->phy_dev;
        } else if (!netif_is_ipvlan_port(phy_dev)) {// the physical device hosts no ipvlan yet: create its port
                err = ipvlan_port_create(phy_dev);
                if (err < 0)
                        return err;
                create = true;
        }

        if (data && data[IFLA_IPVLAN_MODE])
                mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); // requested ipvlan mode (defaults to IPVLAN_MODE_L3)

        port = ipvlan_port_get_rtnl(phy_dev);
        ipvlan->phy_dev = phy_dev;
        ipvlan->dev = dev;
        ipvlan->port = port;
        ipvlan->sfeatures = IPVLAN_FEATURES;
        ipvlan_adjust_mtu(ipvlan, phy_dev);
        INIT_LIST_HEAD(&ipvlan->addrs);

        /* If the port-id base is at the MAX value, then wrap it around and
         * begin from 0x1 again. This may be due to a busy system where lots
         * of slaves are getting created and deleted.
         */
        if (port->dev_id_start == 0xFFFE)
                port->dev_id_start = 0x1;

        /* Since L2 address is shared among all IPvlan slaves including
         * master, use unique 16 bit dev-ids to diffentiate among them.
         * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
         * slave link [see addrconf_ifid_eui48()].
         */
        err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE,
                             GFP_KERNEL); 
        /* Nothing free above dev_id_start: retry in the range below it. */
        if (err < 0)
                err = ida_simple_get(&port->ida, 0x1, port->dev_id_start,
                                     GFP_KERNEL);
        if (err < 0)
                goto destroy_ipvlan_port;
        dev->dev_id = err;
        /* Increment id-base to the next slot for the future assignment */
        port->dev_id_start = err + 1;

        /* TODO Probably put random address here to be presented to the
         * world but keep using the physical-dev address for the outgoing
         * packets.
         */
        memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN); // set the MAC address (copied from the physical device)

        dev->priv_flags |= IFF_IPVLAN_SLAVE;

        err = register_netdevice(dev);
        if (err < 0)
                goto remove_ida;

        err = netdev_upper_dev_link(phy_dev, dev);
        if (err) {
                goto unregister_netdev;
        }
        err = ipvlan_set_port_mode(port, mode); // set the ipvlan mode; author's note: all ipvlan devices on one physical device must share the same mode
        if (err) {
                goto unlink_netdev;
        }

        list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);// add this ipvlan device to the port
        netif_stacked_transfer_operstate(phy_dev, dev);
        return 0;

        /* Error unwinding: each label undoes one earlier step, in reverse order. */
unlink_netdev:
        netdev_upper_dev_unlink(phy_dev, dev);
unregister_netdev:
        unregister_netdevice(dev);
remove_ida:
        ida_simple_remove(&port->ida, dev->dev_id);
destroy_ipvlan_port:
        /* Only destroy the port if this call created it. */
        if (create)
                ipvlan_port_destroy(phy_dev);
        return err;
}

打开设备 函数

/*
 * Open handler for an ipvlan device.
 *
 * Sets IFF_NOARP in L3 and L3S mode and clears it otherwise, adds every
 * address of the device to the lookup hash, then adds the lower device's
 * own address through dev_uc_add() and returns that call's result.
 */
static int ipvlan_open(struct net_device *dev)
{
        struct ipvl_dev *priv = netdev_priv(dev);
        struct net_device *lower = priv->phy_dev;
        struct ipvl_addr *entry;

        switch (priv->port->mode) {
        case IPVLAN_MODE_L3:
        case IPVLAN_MODE_L3S:
                dev->flags |= IFF_NOARP;
                break;
        default:
                /* Author's note: only L2 mode needs to answer ARP. */
                dev->flags &= ~IFF_NOARP;
                break;
        }

        /* Make each of this device's IP addresses findable via the hash. */
        list_for_each_entry(entry, &priv->addrs, anode)
                ipvlan_ht_addr_add(priv, entry);

        return dev_uc_add(lower, lower->dev_addr);
}

数据流 接收数据包

/*
 * Receive handler: dispatch an incoming frame according to the port mode.
 *
 * Returns RX_HANDLER_PASS when the receiving device has no port or the port
 * is in L3S mode, the mode handler's result for L2 and L3, and, for an
 * unknown mode, warns once, frees the skb and returns RX_HANDLER_CONSUMED.
 */
rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev);
        rx_handler_result_t verdict = RX_HANDLER_CONSUMED;

        if (!port)
                return RX_HANDLER_PASS;

        switch (port->mode) {
        case IPVLAN_MODE_L3S:
                /*
                 * Author's note: L3S does not change the device while the
                 * frame walks the stack, only later on the local-in path.
                 */
                verdict = RX_HANDLER_PASS;
                break;
        case IPVLAN_MODE_L2:
                verdict = ipvlan_handle_mode_l2(pskb, port);
                break;
        case IPVLAN_MODE_L3:
                verdict = ipvlan_handle_mode_l3(pskb, port);
                break;
        default:
                /* Should not reach here */
                WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
                                  port->mode);
                kfree_skb(skb);
                break;
        }

        return verdict;
}
/*
 * L2-mode receive path.
 *
 * Unicast frames are matched against the port's address hash. On a match
 * the frame is handed to ipvlan_rcv_frame() and its result is returned.
 * Multicast frames that ipvlan_external_frame() accepts are cloned and the
 * clone is queued on the port. Every other path returns RX_HANDLER_PASS.
 */
static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
                                                 struct ipvl_port *port)
{
        struct sk_buff *skb = *pskb;
        struct ethhdr *eth = eth_hdr(skb);
        struct ipvl_addr *owner;
        struct sk_buff *copy;
        void *l3hdr;
        int l3type;

        if (!is_multicast_ether_addr(eth->h_dest)) {
                l3hdr = ipvlan_get_L3_hdr(skb, &l3type);
                if (!l3hdr)
                        return RX_HANDLER_PASS;

                /* Find the receiving device through the IP address hash. */
                owner = ipvlan_addr_lookup(port, l3hdr, l3type, true);
                if (!owner)
                        return RX_HANDLER_PASS;

                /*
                 * Author's note: this retargets the frame at the matching
                 * ipvlan device and yields RX_HANDLER_ANOTHER.
                 */
                return ipvlan_rcv_frame(owner, pskb, false);
        }

        if (!ipvlan_external_frame(skb, port))
                return RX_HANDLER_PASS;

        /* External frames are queued for device local
         * distribution, but a copy is given to master
         * straight away to avoid sending duplicates later
         * when work-queue processes this frame. This is
         * achieved by returning RX_HANDLER_PASS.
         */
        copy = skb_clone(skb, GFP_ATOMIC);
        if (copy) {
                ipvlan_skb_crossing_ns(copy, NULL);
                ipvlan_multicast_enqueue(port, copy, false);
        }

        return RX_HANDLER_PASS;
}
/*
 * L3-mode receive path: look the frame's L3 address up in the port's hash
 * and, on a match, hand the frame to ipvlan_rcv_frame() and return its
 * result. Returns RX_HANDLER_PASS when there is no L3 header or no match.
 */
static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb,
                                                 struct ipvl_port *port)
{
        struct sk_buff *skb = *pskb;
        struct ipvl_addr *owner;
        int l3type;
        void *l3hdr;

        l3hdr = ipvlan_get_L3_hdr(skb, &l3type);
        if (!l3hdr)
                return RX_HANDLER_PASS;

        /* Find the receiving device by address. */
        owner = ipvlan_addr_lookup(port, l3hdr, l3type, true);
        if (!owner)
                return RX_HANDLER_PASS;

        /* Author's note: changes the receiving device and yields RX_HANDLER_ANOTHER. */
        return ipvlan_rcv_frame(owner, pskb, false);
}

数据流 发包流程

/*
 * Transmit entry point: send the skb through ipvlan_queue_xmit() and update
 * this CPU's statistics. NET_XMIT_SUCCESS and NET_XMIT_CN count as sent
 * packets and bytes; any other result counts as a drop.
 *
 * Returns the result of ipvlan_queue_xmit().
 */
static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        const struct ipvl_dev *priv = netdev_priv(dev);
        struct ipvl_pcpu_stats *stats;
        /* Length is sampled before the skb is handed off. */
        int nbytes = skb->len;
        int ret;

        ret = ipvlan_queue_xmit(skb, dev);
        if (unlikely(ret != NET_XMIT_SUCCESS && ret != NET_XMIT_CN)) {
                this_cpu_inc(priv->pcpu_stats->tx_drps);
                return ret;
        }

        stats = this_cpu_ptr(priv->pcpu_stats);

        u64_stats_update_begin(&stats->syncp);
        stats->tx_pkts++;
        stats->tx_bytes += nbytes;
        u64_stats_update_end(&stats->syncp);

        return ret;
}
/*
 * Dispatch an outgoing skb to the transmit routine that matches the mode
 * of the port behind @dev's physical device.
 *
 * The skb is freed and NET_XMIT_DROP returned when there is no port, when
 * an Ethernet header cannot be pulled, or when the mode is unknown.
 */
int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev);

        if (!port || unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
                goto drop;

        switch (port->mode) {
        case IPVLAN_MODE_L3:
        case IPVLAN_MODE_L3S:
                /* L3 and L3S share the same transmit path. */
                return ipvlan_xmit_mode_l3(skb, dev);
        case IPVLAN_MODE_L2:
                return ipvlan_xmit_mode_l2(skb, dev);
        }

        /* Should not reach here */
        WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n",
                          port->mode);
drop:
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
/*
 * L2-mode transmit path.
 *
 * Three cases are distinguished by the Ethernet header:
 *  - destination MAC == source MAC: the packet is addressed to this host;
 *    deliver it to the owning ipvlan slave, or forward it to the physical
 *    device if no slave owns the L3 address;
 *  - multicast/broadcast destination: queue it on the port's multicast
 *    backlog;
 *  - anything else: send it out through the physical device.
 */
static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ethhdr *eth = eth_hdr(skb);
        struct ipvl_addr *owner = NULL;
        void *l3_hdr;
        int l3_type;

        if (!ether_addr_equal(eth->h_dest, eth->h_source)) {
                if (is_multicast_ether_addr(eth->h_dest)) {
                        /* Defer to the multicast backlog queue. */
                        ipvlan_skb_crossing_ns(skb, NULL);
                        ipvlan_multicast_enqueue(ipvlan->port, skb, true);
                        return NET_XMIT_SUCCESS;
                }

                /* Plain unicast: leave through the physical device. */
                ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
                return dev_queue_xmit(skb);
        }

        /* Packet addressed to ourselves. */
        l3_hdr = ipvlan_get_L3_hdr(skb, &l3_type);
        if (l3_hdr)
                owner = ipvlan_addr_lookup(ipvlan->port, l3_hdr, l3_type, true);
        if (owner)
                return ipvlan_rcv_frame(owner, &skb, true);

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb)
                return NET_XMIT_DROP;

        /* Packet definitely does not belong to any of the
         * virtual devices, but the dest is local. So forward
         * the skb for the main-dev. At the RX side we just return
         * RX_PASS for it to be processed further on the stack.
         */
        return dev_forward_skb(ipvlan->phy_dev, skb);
}
/*
 * L3-mode transmit path: if the L3 destination belongs to another slave on
 * the same port, deliver the frame to it directly; otherwise hand the skb
 * to ipvlan_process_outbound() for routed transmission.
 */
static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_addr *owner;
        int l3_type;
        void *l3_hdr = ipvlan_get_L3_hdr(skb, &l3_type);

        if (l3_hdr) {
                owner = ipvlan_addr_lookup(ipvlan->port, l3_hdr, l3_type, true);
                if (owner)
                        return ipvlan_rcv_frame(owner, &skb, true);
        }

        /* No local owner (or no parsable L3 header): go the outbound way. */
        ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
        return ipvlan_process_outbound(skb);
}
/*
 * Send an skb out in L3 mode: drop multicast/broadcast, strip the Ethernet
 * header, then dispatch on the protocol to the IPv4 or IPv6 outbound
 * helper.
 *
 * Returns the helper's result, or NET_XMIT_DROP (with the skb freed) for
 * multicast frames and unsupported protocols.
 */
static int ipvlan_process_outbound(struct sk_buff *skb)
{
        struct ethhdr *ethh = eth_hdr(skb);

        /* In this mode we dont care about multicast and broadcast traffic */
        if (is_multicast_ether_addr(ethh->h_dest)) {
                pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n",
                                    ntohs(skb->protocol));
                kfree_skb(skb);
                return NET_XMIT_DROP;
        }

        /* The ipvlan is a pseudo-L2 device, so the packets that we receive
         * will have L2; which need to discarded and processed further
         * in the net-ns of the main-device.
         */
        if (skb_mac_header_was_set(skb)) {
                skb_pull(skb, sizeof(*ethh));
                skb->mac_header = (typeof(skb->mac_header))~0U;
                skb_reset_network_header(skb);
        }

        if (skb->protocol == htons(ETH_P_IP))
                return ipvlan_process_v4_outbound(skb);
        if (skb->protocol == htons(ETH_P_IPV6))
                return ipvlan_process_v6_outbound(skb);

        pr_warn_ratelimited("Dropped outbound packet type=%x\n",
                            ntohs(skb->protocol));
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
/*
 * Route an IPv4 skb and send it with ip_local_out().
 *
 * Returns NET_XMIT_SUCCESS when ip_local_out() reports success and
 * NET_XMIT_DROP otherwise. If the route lookup fails or the route is
 * neither unicast nor local, the skb is freed here. Every failure bumps
 * the device's tx_errors counter.
 */
static int ipvlan_process_v4_outbound(struct sk_buff *skb)
{
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net_device *dev = skb->dev;
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int err;
        struct flowi4 fl4 = {
                .flowi4_oif = dev->ifindex,
                .flowi4_tos = RT_TOS(ip4h->tos),
                .flowi4_flags = FLOWI_FLAG_ANYSRC,
                .daddr = ip4h->daddr,
                .saddr = ip4h->saddr,
        };

        /* Route lookup for the flow described above. */
        rt = ip_route_output_flow(net, &fl4, NULL);
        if (IS_ERR(rt))
                goto drop;

        if (!(rt->rt_type == RTN_UNICAST || rt->rt_type == RTN_LOCAL)) {
                ip_rt_put(rt);
                goto drop;
        }

        /* Attach the route and let the IP layer send the packet. */
        skb_dst_set(skb, &rt->dst);
        err = ip_local_out(net, skb->sk, skb);
        if (unlikely(net_xmit_eval(err))) {
                dev->stats.tx_errors++;
                return NET_XMIT_DROP;
        }
        return NET_XMIT_SUCCESS;

drop:
        dev->stats.tx_errors++;
        kfree_skb(skb);
        return NET_XMIT_DROP;
}

ipvlan设备相当于macvlan设备的扩充,L3模式在发送数据包时 不一定会走ipvlan宿主设备,可能通过路由走其他设备发送出去。

分类
未分类

macvtap 网卡

  • macvtap设备和macvlan设备的区别
    macvtap设备是基于macvlan设备的。即每次创建一个macvtap设备必定会创建一个macvlan设备
    macvtap设备和macvlan设备最大的区别是数据包的来源和去向
    macvtap设备会关联一个字符设备,通过macvlan设备收上来的数据包直接扔到 字符设备的缓冲区里面,等待字符设备读取。字符设备发送下来的包直接通过关联的macvlan设备发送出去。
    macvtap 设备收到的数据包直接上送到用户态,用户态程序通过macvtap设备关联的字符设备直接发送数据包。
    macvlan 设备收到的数据包上送到协议栈处理。然后协议栈通过发包接口调用macvlan设备发送数据包
  • macvtap设备和tun/tap设备的区别
    macvtap设备是将macvlan网卡设备收到的包通过字符设备上送到用户态,用户态写字符设备通过macvtap设备将数据从关联的macvlan设备发送出去。
    tun/tap设备协议栈通过tun/tap设备的虚拟网卡,将发送的数据包通过字符设备上送到用户态,用户态写字符设备通过tun/tap设备的虚拟网卡,将数据包上送到协议栈。
    总结起来就是,tun/tap设备将虚拟网卡发送的数据包上送到用户态,macvtap设备将虚拟macvlan网卡接收的数据包上送到用户态。tun/tap设备将用户态下发的数据包上送到协议栈,macvtap设备将用户态下发的数据包,通过虚拟macvlan设备发送出去。

私有数据结构,这个结构会被当做创建的macvlan网卡设备的私有数据。

/*
 * Private data of a macvtap device: the macvlan state followed by the tap
 * state. Per the accompanying text it is used as the private area of the
 * macvlan net_device that gets created.
 */
struct macvtap_dev {
        struct macvlan_dev vlan; /* macvlan private state (discussed in the previous section) */
        struct tap_dev    tap;   /* macvtap (tap) private state */
};
/*
 * Tap-side state attached to a net_device: the set of open queues plus the
 * callbacks the tap code uses for feature updates and drop accounting.
 */
struct tap_dev {
        struct net_device       *dev;       /* net_device this tap is bound to */
        u16                     flags;
        /* This array tracks active taps. */
        struct tap_queue    __rcu *taps[MAX_TAP_QUEUES];    /* receive queues, one per open file */
        /* This list tracks all taps (both enabled and disabled) */
        struct list_head        queue_list;
        int                     numvtaps;
        int                     numqueues;
        netdev_features_t       tap_features;
        int                     minor;      /* minor number of the character device */

        /* Callbacks supplied by the owner (macvtap_newlink() sets all three). */
        void (*update_features)(struct tap_dev *tap, netdev_features_t features);
        void (*count_tx_dropped)(struct tap_dev *tap);
        void (*count_rx_dropped)(struct tap_dev *tap);
};
/*
 * One queue of a tap device; tap_open() allocates one per open of the
 * character device.
 */
struct tap_queue {
        /*
         * NOTE(review): tap_open() casts the sk_alloc() result to
         * struct tap_queue *, so this presumably has to stay the first
         * member - confirm before reordering.
         */
        struct sock sk;
        struct socket sock;
        struct socket_wq wq;
        int vnet_hdr_sz;            /* size of the virtio-net header used on this queue */
        struct tap_dev __rcu *tap;  /* owning tap device (RCU protected) */
        struct file *file;
        unsigned int flags;
        u16 queue_index;
        bool enabled;
        struct list_head next;
        struct skb_array skb_array; /* packet buffer filled by tap_handle_frame() */
};

模块注册

/*
 * Module initialisation for macvtap. Registers, in order: the character
 * device region, the sysfs class, the netdevice notifier and the rtnl link
 * ops. On failure, everything registered so far is undone in reverse order
 * and the error is returned.
 */
static int macvtap_init(void)
{
        int err;

        /* Reserve the major/minor range used by the character devices. */
        err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
        if (err)
                goto fail;

        /* Register the macvtap class in sysfs. */
        err = class_register(&macvtap_class);
        if (err)
                goto fail_cdev;

        /* Get called back on net_device events. */
        err = register_netdevice_notifier(&macvtap_notifier_block);
        if (err)
                goto fail_class;

        /* Expose the link type so that "ip link" can create macvtap devices. */
        err = macvlan_link_register(&macvtap_link_ops);
        if (err)
                goto fail_notifier;

        return 0;

fail_notifier:
        unregister_netdevice_notifier(&macvtap_notifier_block);
fail_class:
        class_unregister(&macvtap_class);
fail_cdev:
        tap_destroy_cdev(macvtap_major, &macvtap_cdev);
fail:
        return err;
}

macvtap设备创建

/*
 * Setup callback for a macvtap net_device: reuse the common macvlan
 * initialisation, then override the transmit queue length.
 */
static void macvtap_setup(struct net_device *dev)
{
        macvlan_common_setup(dev); /* generic macvlan net_device initialisation */
        dev->tx_queue_len = TUN_READQ_SIZE;
}
static int macvtap_newlink(struct net *src_net, struct net_device *dev,
                           struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        struct macvtap_dev *vlantap = netdev_priv(dev);
        int err;

        INIT_LIST_HEAD(&vlantap->tap.queue_list);

        /* Since macvlan supports all offloads by default, make
         * tap support all offloads also.
         */
        vlantap->tap.tap_features = TUN_OFFLOADS;

        /* Register callbacks for rx/tx drops accounting and updating
         * net_device features
         */
        vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
        vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
        vlantap->tap.update_features  = macvtap_update_features;

        err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);//macvtap设备rx_handler函数
        if (err)
                return err;

        /* Don't put anything that may fail after macvlan_common_newlink
         * because we can't undo what it does.
         */
        err = macvlan_common_newlink(src_net, dev, tb, data);//调用macvlan注册新网卡设备
        if (err) {
                netdev_rx_handler_unregister(dev);
                return err;
        }

        vlantap->tap.dev = vlantap->vlan.dev;

        return 0;
}

网卡注册时会触发 网卡注册事件,最后回调macvtap_device_event函数。

static int macvtap_device_event(struct notifier_block *unused,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct macvtap_dev *vlantap;
        struct device *classdev;
        dev_t devt;
        int err;
        char tap_name[IFNAMSIZ];

        if (dev->rtnl_link_ops != &macvtap_link_ops) //如果不是macvtap网卡设备注册不管
                return NOTIFY_DONE;

        snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
        vlantap = netdev_priv(dev);

        switch (event) {
        case NETDEV_REGISTER:
                /* Create the device node here after the network device has
                 * been registered but before register_netdevice has
                 * finished running.
                 */
                err = tap_get_minor(macvtap_major, &vlantap->tap);//获取即将创建的字符设备的minor号,并将字符设备和网卡设备给关联
                if (err)
                        return notifier_from_errno(err);

                devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
                classdev = device_create(&macvtap_class, &dev->dev, devt,
                                         dev, tap_name);//创建macvtap对应的字符设备
                if (IS_ERR(classdev)) {
                        tap_free_minor(macvtap_major, &vlantap->tap);
                        return notifier_from_errno(PTR_ERR(classdev));
                }
                err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
                                        tap_name);
                if (err)
                        return notifier_from_errno(err);
                break;
        case NETDEV_UNREGISTER:
                /* vlan->minor == 0 if NETDEV_REGISTER above failed */
                if (vlantap->tap.minor == 0)
                        break;
                sysfs_remove_link(&dev->dev.kobj, tap_name);
                devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
                device_destroy(&macvtap_class, devt);
                tap_free_minor(macvtap_major, &vlantap->tap);
                break;
        case NETDEV_CHANGE_TX_QUEUE_LEN:
                if (tap_queue_resize(&vlantap->tap))
                        return NOTIFY_BAD;
                break;
        }

        return NOTIFY_DONE;
}

打开字符设备,用于用户态通讯

/*
 * open() handler of the tap character device. Each open allocates a new
 * tap_queue (socket + skb_array) and attaches it to the tap device that
 * matches the inode's major/minor numbers. The whole operation runs under
 * the rtnl lock.
 *
 * Returns the result of tap_set_queue() on the success path, -ENODEV when
 * no tap device matches the device numbers, or -ENOMEM when the queue or
 * its skb_array cannot be allocated.
 */
static int tap_open(struct inode *inode, struct file *file)
{
        struct net *net = current->nsproxy->net_ns;
        struct tap_dev *tap;
        struct tap_queue *q;
        int err = -ENODEV;

        rtnl_lock();
        tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); /* look up the tap by major/minor */
        if (!tap)
                goto err;

        err = -ENOMEM;
        q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                             &tap_proto, 0); /* one new tap_queue per open of the char device */
        if (!q)
                goto err;

        RCU_INIT_POINTER(q->sock.wq, &q->wq);
        init_waitqueue_head(&q->wq.wait);
        q->sock.type = SOCK_RAW;
        q->sock.state = SS_CONNECTED;
        q->sock.file = file;
        q->sock.ops = &tap_socket_ops;
        sock_init_data(&q->sock, &q->sk);
        q->sk.sk_write_space = tap_sock_write_space;
        q->sk.sk_destruct = tap_sock_destruct;
        q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
        q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

        /*
         * so far only KVM virtio_net uses tap, enable zero copy between
         * guest kernel and host kernel when lower device supports zerocopy
         *
         * The macvlan supports zerocopy iff the lower device supports zero
         * copy so we don't have to look at the lower device directly.
         */
        if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
                sock_set_flag(&q->sk, SOCK_ZEROCOPY);

        err = -ENOMEM;
        if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
                goto err_array;

        err = tap_set_queue(tap, file, q); /* attach the queue to the tap device */
        if (err)
                goto err_queue;

        /* NOTE(review): presumably drops a reference taken by dev_get_by_tap_file() - confirm. */
        dev_put(tap->dev);

        rtnl_unlock();
        return err;

        /* Error unwinding, in reverse order of setup. */
err_queue:
        skb_array_cleanup(&q->skb_array);
err_array:
        sock_put(&q->sk);
err:
        if (tap)
                dev_put(tap->dev);

        rtnl_unlock();
        return err;
}

至此,数据通路已经完全打开。
接收数据流程
在netif_receive_skb函数中调用 tap_handle_frame函数

/*
 * rx handler of a tap-backed device: instead of passing the frame up the
 * stack, push it into the skb_array of one of the tap queues and wake up
 * any reader.
 *
 * Returns RX_HANDLER_PASS when the device has no tap or no queue is
 * available; otherwise RX_HANDLER_CONSUMED (the frame was either queued
 * or dropped and freed here).
 */
rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct net_device *dev = skb->dev;
        struct tap_dev *tap;
        struct tap_queue *q;
        netdev_features_t features = TAP_FEATURES;

        tap = tap_dev_get_rcu(dev); /* tap state associated with the device */
        if (!tap)
                return RX_HANDLER_PASS;

        q = tap_get_queue(tap, skb); /* pick a queue (hash based, per the original note) */
        if (!q)
                return RX_HANDLER_PASS;

        if (__skb_array_full(&q->skb_array))
                goto drop;

        /* Expose the Ethernet header again before queueing the frame. */
        skb_push(skb, ETH_HLEN);

        /* Apply the forward feature mask so that we perform segmentation
         * according to users wishes.  This only works if VNET_HDR is
         * enabled.
         */
        if (q->flags & IFF_VNET_HDR)
                features |= tap->tap_features;
        if (netif_needs_gso(skb, features)) {
                struct sk_buff *segs = __skb_gso_segment(skb, features, false);

                if (IS_ERR(segs))
                        goto drop;

                if (!segs) {
                        /* No segments were produced: queue the original skb. */
                        if (skb_array_produce(&q->skb_array, skb))
                                goto drop;
                        goto wake_up;
                }

                /* The original skb is replaced by its segments. */
                consume_skb(skb);
                while (segs) {
                        struct sk_buff *nskb = segs->next;

                        segs->next = NULL;
                        if (skb_array_produce(&q->skb_array, segs)) {
                                /* Queueing failed: free this and all remaining segments. */
                                kfree_skb(segs);
                                kfree_skb_list(nskb);
                                break;
                        }
                        segs = nskb;
                }
        } else {
                /* If we receive a partial checksum and the tap side
                 * doesn't support checksum offload, compute the checksum.
                 * Note: it doesn't matter which checksum feature to
                 *        check, we either support them all or none.
                 */
                if (skb->ip_summed == CHECKSUM_PARTIAL &&
                    !(features & NETIF_F_CSUM_MASK) &&
                    skb_checksum_help(skb))
                        goto drop;
                if (skb_array_produce(&q->skb_array, skb)) /* queue the frame */
                        goto drop;
        }

wake_up:
        wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND); /* wake up waiting readers */
        return RX_HANDLER_CONSUMED;

drop:
        /* Count errors/drops only here, thus don't care about args. */
        if (tap->count_rx_dropped)
                tap->count_rx_dropped(tap);
        kfree_skb(skb);
        return RX_HANDLER_CONSUMED;
}

用户态调用tap_read_iter读取数据,tap_do_read 函数跟 tun/tap中函数类似,都是判断缓冲区是否有数据,如果没有数据就等待,如果有数据返回数据

/*
 * read_iter handler of the tap character device: fetch one packet from
 * the queue behind the file via tap_do_read().
 *
 * Returns tap_do_read()'s result capped at the number of bytes the caller
 * asked for; a positive result is also stored in iocb->ki_pos.
 */
static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct tap_queue *q = file->private_data;
        /* Capture the request size before tap_do_read() is handed the iterator. */
        ssize_t requested = iov_iter_count(to);
        ssize_t got;

        got = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL);
        got = min_t(ssize_t, got, requested);
        if (got > 0)
                iocb->ki_pos = got;

        return got;
}

发送数据流程

/*
 * write_iter handler of the tap character device: pass the user data to
 * tap_get_user(), which builds an skb from it and transmits it.
 *
 * No msghdr is supplied (NULL); the non-blocking flag is taken from the
 * file's O_NONBLOCK setting. Returns tap_get_user()'s result.
 */
static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct tap_queue *q = file->private_data;

        /* Send the data. */
        return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
}
/*
 * Build an skb from user-supplied data and transmit it through the tap's
 * net_device.
 *
 * @q:       queue the data was written to
 * @m:       optional msghdr; when it carries msg_control and the socket has
 *           SOCK_ZEROCOPY set, a zerocopy transmit is attempted
 * @from:    iterator over the user data (optional virtio-net header first,
 *           then the Ethernet frame)
 * @noblock: passed on to tap_alloc_skb()
 *
 * Returns the total number of bytes in @from on success (even when the
 * queue has no tap attached and the skb is silently freed), or a negative
 * errno. On error the tap's tx-drop callback, if any, is invoked.
 */
static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
                            struct iov_iter *from, int noblock)
{
        int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
        struct sk_buff *skb;
        struct tap_dev *tap;
        unsigned long total_len = iov_iter_count(from);
        unsigned long len = total_len;
        int err;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int vnet_hdr_len = 0;
        int copylen = 0;
        int depth;
        bool zerocopy = false;
        size_t linear;

        /* Consume and validate the virtio-net header, if the queue uses one. */
        if (q->flags & IFF_VNET_HDR) {
                vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto err;
                len -= vnet_hdr_len;

                err = -EFAULT;
                if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
                        goto err;
                /* Skip any header bytes beyond struct virtio_net_hdr. */
                iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
                /* Make hdr_len cover at least the checksum field when one is requested. */
                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                     tap16_to_cpu(q, vnet_hdr.csum_start) +
                     tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
                             tap16_to_cpu(q, vnet_hdr.hdr_len))
                        vnet_hdr.hdr_len = cpu_to_tap16(q,
                                 tap16_to_cpu(q, vnet_hdr.csum_start) +
                                 tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
                err = -EINVAL;
                if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
                        goto err;
        }

        /* The payload must hold at least an Ethernet header. */
        err = -EINVAL;
        if (unlikely(len < ETH_HLEN))
                goto err;

        /* Decide whether a zerocopy transmit is possible. */
        if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
                struct iov_iter i;

                copylen = vnet_hdr.hdr_len ?
                        tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
                if (copylen > good_linear)
                        copylen = good_linear;
                else if (copylen < ETH_HLEN)
                        copylen = ETH_HLEN;
                linear = copylen;
                /* Work on a copy of the iterator so that @from is not advanced. */
                i = *from;
                iov_iter_advance(&i, copylen);
                if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
                        zerocopy = true;
        }

        /* Copy path: everything is copied; linear part clamped to [ETH_HLEN, good_linear]. */
        if (!zerocopy) {
                copylen = len;
                linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
                if (linear > good_linear)
                        linear = good_linear;
                else if (linear < ETH_HLEN)
                        linear = ETH_HLEN;
        }

        skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
                            linear, noblock, &err); /* allocate the skb */
        if (!skb)
                goto err;

        if (zerocopy)
                err = zerocopy_sg_from_iter(skb, from);
        else
                err = skb_copy_datagram_from_iter(skb, 0, from, len); /* copy the user data in */

        if (err)
                goto err_kfree;

        skb_set_network_header(skb, ETH_HLEN);
        skb_reset_mac_header(skb); /* set the MAC header offset */
        skb->protocol = eth_hdr(skb)->h_proto;

        if (vnet_hdr_len) {
                err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
                                            tap_is_little_endian(q));
                if (err)
                        goto err_kfree;
        }

        skb_probe_transport_header(skb, ETH_HLEN);

        /* Move network header to the right position for VLAN tagged packets */
        if ((skb->protocol == htons(ETH_P_8021Q) ||
             skb->protocol == htons(ETH_P_8021AD)) &&
            __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
                skb_set_network_header(skb, depth);

        rcu_read_lock();
        tap = rcu_dereference(q->tap);
        /* copy skb_ubuf_info for callback when skb has no error */
        if (zerocopy) {
                skb_shinfo(skb)->destructor_arg = m->msg_control;
                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
        } else if (m && m->msg_control) {
                /* Zerocopy was requested but not used: fire the callback now. */
                struct ubuf_info *uarg = m->msg_control;
                uarg->callback(uarg, false);
        }

        if (tap) {
                skb->dev = tap->dev; /* device used for transmission */
                dev_queue_xmit(skb); /* send; per the original note this ends in macvlan's xmit routine */
        } else {
                kfree_skb(skb);
        }
        rcu_read_unlock();

        return total_len;

err_kfree:
        kfree_skb(skb);

err:
        /* Account the drop through the tap's callback, if one is installed. */
        rcu_read_lock();
        tap = rcu_dereference(q->tap);
        if (tap && tap->count_tx_dropped)
                tap->count_tx_dropped(tap);
        rcu_read_unlock();

        return err;
}

至此, macvtap数据发送流程结束。

macvtap在vpn和 虚拟化中应用很广。它主要功能是将macvlan设备获取的数据包直接上送到用户态而不是协议栈。

分类
linux virtual nic 未分类

macvlan 虚拟网卡

macvlan 虚拟网卡设备
macvlan虚拟网卡设备是寄生在物理网卡设备上的。发包时调用自己的发包函数,查找到寄生的物理设备,然后通过物理设备发包。收包时,通过注册寄生的物理设备的rx_handler回调函数,处理数据包。
macvlan 虚拟网卡设备包括5种模式
private 模式:在这种模式下,macvlan设备不能接收寄生在同一个物理网卡的其他macvlan设备的数据包,即使是其他macvlan设备通过物理网卡发送出去并通过hairpin设备返回的包
vepa 模式:在这种模式下,macvlan设备不能直接接收寄生在同一个物理网卡的其他macvlan设备的数据包,但是其他macvlan设备可以将数据包通过物理网卡发送出去,然后通过hairpin设备返回给其他macvlan设备
passthru 模式:在这种模式下,每一个物理设备只能寄生一个macvlan设备
bridge 模式:在这种模式下,寄生在同一个物理设备的macvlan设备可以直接通讯,不需要外接的hairpin设备帮助
source 模式: 在这种模式下,寄生在物理设备的这类macvlan设备,只能接收指定源mac地址的数据包,其他数据包都不接收。

macvlan设备 关键数据结构
macvlan_port ,这个数据结构是在注册rx_handler时使用的,作为回调函数的参数。

/*
 * Per-physical-device macvlan state. According to the accompanying text it
 * is the data handed to the rx_handler when that handler is registered.
 */
struct macvlan_port {
        struct net_device       *dev;                           /* physical (lower) device */
        struct hlist_head       vlan_hash[MACVLAN_HASH_SIZE];   /* hash table of macvlan device private data */
        struct list_head        vlans;                          /* list of macvlan device private data */
        struct sk_buff_head     bc_queue;                       /* queue of broadcast frames */
        struct work_struct      bc_work;                        /* work item that sends the queued broadcasts */
        u32                     flags;                          /* flags */
        int                     count;                          /* number of macvlan devices */
        struct hlist_head       vlan_source_hash[MACVLAN_HASH_SIZE];    /* used only by source-mode macvlan devices */
        DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
        unsigned char           perm_addr[ETH_ALEN];
};

macvlan_dev ,这个数据结构是macvlan网卡的私有数据结构,每创建一个macvlan设备就会创建一个这样的结构,并将它挂在 macvlan_port 数据结构上。

/*
 * Private data of one macvlan net_device. macvlan_common_newlink() fills
 * it in and links it onto the owning macvlan_port's "vlans" list.
 */
struct macvlan_dev {
        struct net_device       *dev;           /* the macvlan net_device itself */
        struct list_head        list;           /* entry in the list of macvlans on the lower device */
        struct hlist_node       hlist;          /* entry in the hash table of macvlans on the lower device */
        struct macvlan_port     *port;          /* owning macvlan_port */
        struct net_device       *lowerdev;      /* physical device this macvlan sits on */
        void                    *fwd_priv;      /* hardware acceleration state, if the NIC supports it */
        struct vlan_pcpu_stats __percpu *pcpu_stats;

        DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);

        netdev_features_t       set_features;
        enum macvlan_mode       mode;
        u16                     flags;
        /* This array tracks active taps. */
        struct tap_queue        __rcu *taps[MAX_TAP_QUEUES];
        /* This list tracks all taps (both enabled and disabled) */
        struct list_head        queue_list;
        int                     numvtaps;
        int                     numqueues;
        netdev_features_t       tap_features;
        int                     minor;
        int                     nest_level;
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll          *netpoll;
#endif
        unsigned int            macaddr_count;
};

两个私有结构之间的关系

// Relationship between the two macvlan private structures: both hash tables
// in struct macvlan_port (vlan_hash and vlan_source_hash) point at
// struct macvlan_dev entries.
digraph tun{
    node [shape = plaintext]
    rankdir = LR
    macvlan_port[label=<
            <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
            <TR><TD BORDER="0" ALIGN="CENTER">struct macvlan_port</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f0">struct net_device *dev</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f1">struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]</TD></TR>
            <TR><TD ALIGN="CENTER">struct list_head vlans</TD></TR>
            <TR><TD ALIGN="LEFT">struct sk_buff_head bc_queue</TD></TR>
            <TR><TD ALIGN="CENTER">struct work_struct bc_work</TD></TR>
            <TR><TD ALIGN="LEFT">u32 flags</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f2">struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            </TABLE>>]
    macvlan_dev[label=<
            <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
            <TR><TD BORDER="0" ALIGN="CENTER">struct macvlan_dev</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f0">struct net_device *dev</TD></TR>
            <TR><TD ALIGN="LEFT">struct list_head list</TD></TR>
            <TR><TD ALIGN="LEFT">struct hlist_node hlist</TD></TR>
            <TR><TD ALIGN="LEFT">struct macvlan_port *port</TD></TR>
            <TR><TD ALIGN="LEFT">struct net_device *lowerdev</TD></TR>
            <TR><TD ALIGN="LEFT">void *fwd_priv</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            </TABLE>>]
    macvlan_port:f1 -> macvlan_dev:f0
    macvlan_port:f2 -> macvlan_dev:f0
}

模块注册

/*
 * Module initialisation for macvlan: register the netdevice notifier, then
 * the rtnl link ops.
 *
 * Returns 0 on success or a negative errno. A failure to register the
 * notifier is reported to the caller instead of being ignored, so the
 * module never runs with its link ops registered but no notifier.
 */
static int __init macvlan_init_module(void)
{
        int err;

        /* Subscribe to net_device events. */
        err = register_netdevice_notifier(&macvlan_notifier_block);
        if (err)
                return err;

        /* Register the netlink (rtnl) link operations. */
        err = macvlan_link_register(&macvlan_link_ops);
        if (err < 0)
                goto err1;
        return 0;
err1:
        unregister_netdevice_notifier(&macvlan_notifier_block);
        return err;
}

创建macvlan设备

/*
 * Common creation path for macvlan-based devices (macvtap_newlink() calls
 * it as well).
 *
 * Resolves the lower device named by IFLA_LINK, creates the macvlan_port on
 * it if none exists yet, fills in the new device's macvlan_dev private
 * data, registers the net_device and links it onto the port.
 *
 * Returns 0 on success or a negative errno. On failure a port that was
 * created by this call is destroyed again.
 */
int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
                           struct nlattr *tb[], struct nlattr *data[])
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct macvlan_port *port;
        struct net_device *lowerdev;
        int err;
        int macmode;
        bool create = false; /* true when the port was created by this call */

        if (!tb[IFLA_LINK])
                return -EINVAL;

        lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
        if (lowerdev == NULL)
                return -ENODEV;

        /* When creating macvlans or macvtaps on top of other macvlans - use
         * the real device as the lowerdev.
         */
        if (netif_is_macvlan(lowerdev))
                lowerdev = macvlan_dev_real_dev(lowerdev);  /* the physical device underneath */

        /* Inherit the lower MTU unless one was given; never exceed it. */
        if (!tb[IFLA_MTU])
                dev->mtu = lowerdev->mtu;
        else if (dev->mtu > lowerdev->mtu)
                return -EINVAL;

        /* MTU range: 68 - lowerdev->max_mtu */
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = lowerdev->max_mtu;

        /* No address supplied: pick a random one. */
        if (!tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        if (!macvlan_port_exists(lowerdev)) {
                /*
                 * The lower device has no macvlan_port yet: create one (per
                 * the original note this also registers the rx_handler).
                 */
                err = macvlan_port_create(lowerdev);
                if (err < 0)
                        return err;
                create = true;
        }
        port = macvlan_port_get_rtnl(lowerdev);

        /* Only 1 macvlan device can be created in passthru mode */
        if (macvlan_passthru(port)) {   /* a passthru macvlan already exists: bail out */
                /* The macvlan port must be not created this time,
                 * still goto destroy_macvlan_port for readability.
                 */
                err = -EINVAL;
                goto destroy_macvlan_port;
        }

        vlan->lowerdev = lowerdev;
        vlan->dev      = dev;
        vlan->port     = port;
        vlan->set_features = MACVLAN_FEATURES;
        vlan->nest_level = dev_get_nest_level(lowerdev) + 1;

        /* VEPA is the default mode unless the caller asks for another one. */
        vlan->mode     = MACVLAN_MODE_VEPA;
        if (data && data[IFLA_MACVLAN_MODE])
                vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);

        if (data && data[IFLA_MACVLAN_FLAGS])
                vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);

        if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
                if (port->count) {          /* passthru requested but the lower device already has macvlans: fail */
                        err = -EINVAL;
                        goto destroy_macvlan_port;
                }
                macvlan_set_passthru(port);
                eth_hw_addr_inherit(dev, lowerdev);
        }

        /* Source MAC lists are only accepted for source-mode devices. */
        if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
                if (vlan->mode != MACVLAN_MODE_SOURCE) {
                        err = -EINVAL;
                        goto destroy_macvlan_port;
                }
                macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]);
                err = macvlan_changelink_sources(vlan, macmode, data);  /* source mode: record the source MACs to accept; such a device only receives frames from those MACs */
                if (err)
                        goto destroy_macvlan_port;
        }

        err = register_netdevice(dev);
        if (err < 0)
                goto destroy_macvlan_port;

        dev->priv_flags |= IFF_MACVLAN;
        err = netdev_upper_dev_link(lowerdev, dev);
        if (err)
                goto unregister_netdev;

        list_add_tail_rcu(&vlan->list, &port->vlans);   /* add the macvlan to the port's list */
        netif_stacked_transfer_operstate(lowerdev, dev);
        linkwatch_fire_event(dev);

        return 0;

unregister_netdev:
        unregister_netdevice(dev);
destroy_macvlan_port:
        /* Only tear the port down if this call created it. */
        if (create)
                macvlan_port_destroy(port->dev);
        return err;
}

收包逻辑

/*
 * macvlan_handle_frame - receive-side handler for frames seen on the lower
 * device (presumably installed as that device's rx handler - confirm at the
 * registration site).
 *
 * @pskb: in/out pointer to the received skb; it is rewritten whenever the skb
 *        is replaced by ip_check_defrag() or skb_share_check().
 *
 * Returns RX_HANDLER_CONSUMED when the skb was taken over or dropped here,
 * RX_HANDLER_PASS when the lower device should keep processing it, and
 * RX_HANDLER_ANOTHER after skb->dev has been switched to a macvlan device.
 */
static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
{
        struct macvlan_port *port;
        struct sk_buff *skb = *pskb;
        const struct ethhdr *eth = eth_hdr(skb);
        const struct macvlan_dev *vlan;
        const struct macvlan_dev *src;
        struct net_device *dev;
        unsigned int len = 0;
        int ret;
        rx_handler_result_t handle_res;

        port = macvlan_port_get_rcu(skb->dev);
        if (is_multicast_ether_addr(eth->h_dest)) { // multicast/broadcast destination handling
                unsigned int hash;

                skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN);
                if (!skb)
                        return RX_HANDLER_CONSUMED;
                *pskb = skb;
                eth = eth_hdr(skb);     // skb may have been replaced above, so re-read the header
                macvlan_forward_source(skb, port, eth->h_source);// receive path for source-mode macvlan devices
                src = macvlan_hash_lookup(port, eth->h_source);// is the sender a macvlan device on the same lower device?
                if (src && src->mode != MACVLAN_MODE_VEPA &&
                    src->mode != MACVLAN_MODE_BRIDGE) { // sender is a local macvlan in source/private/passthru mode: deliver the frame only back to the sender itself
                        /* forward to original port. */
                        vlan = src;
                        ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:
                              netif_rx(skb);
                        handle_res = RX_HANDLER_CONSUMED;
                        goto out;
                }

                hash = mc_hash(NULL, eth->h_dest);
                if (test_bit(hash, port->mc_filter))
                        macvlan_broadcast_enqueue(port, src, skb);  // queue the frame; the actual flooding is done later from a work queue

                return RX_HANDLER_PASS; // also let the lower device process the frame
        }

        macvlan_forward_source(skb, port, eth->h_source); // unicast: receive path for source-mode macvlan devices
        if (macvlan_passthru(port))
                vlan = list_first_or_null_rcu(&port->vlans,
                                              struct macvlan_dev, list);
        else
                vlan = macvlan_hash_lookup(port, eth->h_dest); // find the macvlan device that owns the destination address
        if (vlan == NULL)
                return RX_HANDLER_PASS; // no match: continue with the lower device's normal path

        dev = vlan->dev;
        if (unlikely(!(dev->flags & IFF_UP))) {
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED; // matching macvlan exists but is down: drop the skb
        }
        len = skb->len + ETH_HLEN;
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb) {
                ret = NET_RX_DROP;
                handle_res = RX_HANDLER_CONSUMED;
                goto out;
        }

        *pskb = skb;
        skb->dev = dev;
        skb->pkt_type = PACKET_HOST; // matching macvlan is up: mark the frame as addressed to this host

        ret = NET_RX_SUCCESS;
        handle_res = RX_HANDLER_ANOTHER;    // continue processing on the macvlan device
out:
        // len is still 0 when we arrive here from the multicast branch
        macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
        return handle_res;
}

发送工作队列,工作函数

static void macvlan_process_broadcast(struct work_struct *w)
{       
        struct macvlan_port *port = container_of(w, struct macvlan_port,
                                                 bc_work);
        struct sk_buff *skb;
        struct sk_buff_head list;

        __skb_queue_head_init(&list);

        spin_lock_bh(&port->bc_queue.lock);
        skb_queue_splice_tail_init(&port->bc_queue, &list);
        spin_unlock_bh(&port->bc_queue.lock);

        while ((skb = __skb_dequeue(&list))) {
                const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;

                rcu_read_lock();

                if (!src) //如果不是寄生在同一个 物理设备上的macvlan发送,则所有macvlan都 收包
                        /* frame comes from an external address */
                        macvlan_broadcast(skb, port, NULL,
                                          MACVLAN_MODE_PRIVATE |
                                          MACVLAN_MODE_VEPA    |
                                          MACVLAN_MODE_PASSTHRU|
                                          MACVLAN_MODE_BRIDGE);
                else if (src->mode == MACVLAN_MODE_VEPA)//如果是寄生的同一个物理设备上的VEPA类型macvlan发送,则VEPA类型和BRIDGE类型macvlan都受到,private不收
                        /* flood to everyone except source */
                        macvlan_broadcast(skb, port, src->dev,
                                          MACVLAN_MODE_VEPA |
                                          MACVLAN_MODE_BRIDGE);
                else    //如果是寄生在同一个物理设备上的BRIDGE类型macvlan发送,则VEPA类型macvlan受到,private 和 bridge类型不收, bridge不收是因为在发送的时候已经上送给它了。
                        /*
                         * flood only to VEPA ports, bridge ports
                         * already saw the frame on the way out.
                         */
                        macvlan_broadcast(skb, port, src->dev,
                                          MACVLAN_MODE_VEPA);

                rcu_read_unlock();

                if (src)
                        dev_put(src->dev);
                kfree_skb(skb);
        }
}

macvlan 通过控制广播报文的收发,保证寄生在同一个物理设备上的 private 模式设备之间互相收不到对方的包:ARP 协议基于广播报文,收不到对方的 ARP 广播,就无法解析对方的 MAC 地址。

发包流程

/*
 * Transmit entry point of a macvlan device: hands the skb to the netpoll,
 * accelerated or regular transmit path and then updates the per-CPU tx
 * counters according to the result.
 *
 * Returns whatever the chosen transmit path returned.
 */
static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        /* Length is sampled up front, before the skb is handed off. */
        unsigned int bytes = skb->len;
        struct vlan_pcpu_stats *stats;
        int rc;

        if (unlikely(netpoll_tx_running(dev)))
                return macvlan_netpoll_send_skb(vlan, skb);

        if (!vlan->fwd_priv) {
                /* No accelerated transmit context: regular path. */
                rc = macvlan_queue_xmit(skb, dev);
        } else {
                /* Accelerated transmit through the lower device. */
                skb->dev = vlan->lowerdev;
                rc = dev_queue_xmit_accel(skb, vlan->fwd_priv);
        }

        if (unlikely(rc != NET_XMIT_SUCCESS && rc != NET_XMIT_CN)) {
                this_cpu_inc(vlan->pcpu_stats->tx_dropped);
                return rc;
        }

        /* Successful (or congestion-notified) send: account packet and bytes. */
        stats = this_cpu_ptr(vlan->pcpu_stats);
        u64_stats_update_begin(&stats->syncp);
        stats->tx_packets++;
        stats->tx_bytes += bytes;
        u64_stats_update_end(&stats->syncp);

        return rc;
}

macvlan_queue_xmit函数

static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
        const struct macvlan_dev *vlan = netdev_priv(dev);
        const struct macvlan_port *port = vlan->port;
        const struct macvlan_dev *dest;

        if (vlan->mode == MACVLAN_MODE_BRIDGE) {    //macvlan如果是bridge模式
                const struct ethhdr *eth = (void *)skb->data;

                /* send to other bridge ports directly */
                if (is_multicast_ether_addr(eth->h_dest)) { //给其他设备发送广播报文,跟前面的收包对应。
                        macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
                        goto xmit_world;
                }

                dest = macvlan_hash_lookup(port, eth->h_dest);//查找是否为发给寄生在同一个物理网卡的其他macvlan设备
                if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {//如果是,且接受macvlan也是bridge模式,直接发送由寄生的物理网卡收包。
                        /* send to lowerdev first for its network taps */
                        dev_forward_skb(vlan->lowerdev, skb);

                        return NET_XMIT_SUCCESS;
                }
        }

xmit_world:
        skb->dev = vlan->lowerdev;
        return dev_queue_xmit(skb); //通过物理网卡发送数据包。
}

打开网络设备

/*
 * macvlan_open - bring a macvlan device up (presumably the device's open
 * callback - confirm against the netdev ops table).
 *
 * Returns 0 on success, -EBUSY when the device address is already in use
 * on the port, or the negative error from the failing lower-device call.
 * On failure, the steps already taken are undone in reverse order.
 */
static int macvlan_open(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;
        int err;

        // passthru port: optionally make the lower device promiscuous, then
        // skip all address setup and go straight to the hash insertion
        if (macvlan_passthru(vlan->port)) {
                if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) {
                        err = dev_set_promiscuity(lowerdev, 1);
                        if (err < 0)
                                goto out;
                }
                goto hash_add;
        }

        if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD &&
            dev->rtnl_link_ops == &macvlan_link_ops) {
                vlan->fwd_priv =
                      lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);

                /* If we get a NULL pointer back, or if we get an error
                 * then we should just fall through to the non accelerated path
                 */
                if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
                        vlan->fwd_priv = NULL;
                } else
                        return 0;
        }   // if the lower device offers hardware L2 forwarding offload, set it up and return

        err = -EBUSY;
        if (macvlan_addr_busy(vlan->port, dev->dev_addr))
                goto out;

        err = dev_uc_add(lowerdev, dev->dev_addr);
        if (err < 0)
                goto out;
        if (dev->flags & IFF_ALLMULTI) {
                err = dev_set_allmulti(lowerdev, 1);
                if (err < 0)
                        goto del_unicast;
        }

        if (dev->flags & IFF_PROMISC) {
                err = dev_set_promiscuity(lowerdev, 1); // enable promiscuous reception on the lower device
                if (err < 0)
                        goto clear_multi;
        }

hash_add:
        macvlan_hash_add(vlan); // no hardware offload in use: add the opened macvlan to the hash table
        return 0;

        // error unwinding: each label undoes the step taken before the failing one
clear_multi:
        if (dev->flags & IFF_ALLMULTI)
                dev_set_allmulti(lowerdev, -1);
del_unicast:
        dev_uc_del(lowerdev, dev->dev_addr);
out:
        if (vlan->fwd_priv) {
                lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
                                                           vlan->fwd_priv);
                vlan->fwd_priv = NULL;
        }
        return err;
}

关闭

/*
 * macvlan_stop - take a macvlan device down, undoing what macvlan_open()
 * set up on the lower device. Always returns 0.
 */
static int macvlan_stop(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;

        // offloaded device: removing the station is all there is to do
        if (vlan->fwd_priv) {
                lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
                                                           vlan->fwd_priv); // tear down the hardware offload
                vlan->fwd_priv = NULL;
                return 0;
        }

        dev_uc_unsync(lowerdev, dev);
        dev_mc_unsync(lowerdev, dev);

        // passthru port: only the promiscuity taken in macvlan_open() needs dropping
        if (macvlan_passthru(vlan->port)) {
                if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC))
                        dev_set_promiscuity(lowerdev, -1);
                goto hash_del;
        }

        if (dev->flags & IFF_ALLMULTI)
                dev_set_allmulti(lowerdev, -1);

        if (dev->flags & IFF_PROMISC)
                dev_set_promiscuity(lowerdev, -1); // drop the promiscuous-mode reference on the lower device

        dev_uc_del(lowerdev, dev->dev_addr); // counterpart of the dev_uc_add() in macvlan_open()

hash_del:
        // remove this macvlan from the hash table
        // NOTE(review): meaning of the second argument (!dev->dismantle) is not visible here - confirm
        macvlan_hash_del(vlan, !dev->dismantle);
        return 0;
}

macvlan 目前应用广泛,在 docker 和虚拟化场景中都很常见:虚拟化中多数使用 macvtap 设备,docker 中使用 macvlan 设备。另外,vrrp 等需要接收发往其他 mac 地址的报文的应用也可以使用 macvlan 设备。

分类
未分类

世界,您好!

欢迎使用WordPress。这是您的第一篇文章。编辑或删除它,然后开始写作吧!

分类
未分类

线程通讯

线程通讯

线程与线程之间通信,除了直接访问共享变量之外,还有一种更安全的方式,那就是消息循环。

建立线程消息循环

除了每个窗口都有一个消息循环以外,每个线程也都可以有一个消息循环。但是需要注意的是,为了节约资源,线程默认情况下不会建立消息循环,因为不是所有线程都会去接收消息。但是,如果你在线程函数中调用获取消息的函数的话,系统就会自动为线程创建消息循环。

总的来说,创建一个线程的消息循环,格式大致如下:

```C++
DWORD WINAPI 线程函数(LPVOID lpParam)
{
    // 一些初始化操作

    // 调用 PeekMessage 去让系统建立消息循环
    // 因为参数的最后一个传入的是 PM_NOREMOVE,所以原来的消息依然存在消息循环中
    // 这里并不去判断消息,只是为了建立消息循环
    MSG msg;
    PeekMessage(&msg, NULL, WM_USER, WM_USER, PM_NOREMOVE);
    // 下面是消息循环主体
    while (GetMessage(&msg, NULL, NULL, NULL))
    {
        switch (msg.message)
        {
        case 消息1:
            消息1的处理;
            break;
        case 消息2:
            消息2的处理;
            break;
        // 更多的消息处理
        default:
            // 和窗口消息循环不一样,线程消息循环可以没有默认消息处理
            break;
        }
    }
    // 清理操作
    return 0;
}
```

一旦你的线程函数执行在消息循环中,就会一直等待有人给这个线程发消息,直到线程收到了 `WM_QUIT` 消息之后,线程才会退出。

注意,当 `GetMessage()` 获取到了 `WM_QUIT` 消息的时候,就会返回 `FALSE`。

## 向线程发消息

那么,怎样从另一个线程给某个指定的线程发消息呢?发送消息需要用到 `PostThreadMessage()` 函数,这个函数的用法如下:

```C++
BOOL PostThreadMessage(
    DWORD idThread, // 目标线程的 ID
    UINT Msg,       // 要发送的消息
    WPARAM wParam,  // 消息的附加参数1
    LPARAM lParam   // 消息的附加参数2
);
```

这个函数的第一个参数,目标线程的 ID 怎么获取呢?当我们在创建线程的时候,`CreateThread()` 这个函数的最后一个参数返回的就是线程的 ID,所以,传入这个 ID 就可以了。例如:

```C++
DWORD id;
CreateThread(NULL, 0, fun, NULL, 0, &id);
PostThreadMessage(id, 消息, 0, 0);
```

另外需要注意的一点是,因为线程执行时机的不确定性,当你调用 `PostThreadMessage()` 的时候目标线程的消息循环可能还没建立,这个时候这个函数就会返回 FALSE。所以,你在发送线程消息的时候,需要检查一下这个函数的返回值,如果发送失败了,就需要重新发送这个消息。例如,当你确认目标线程的消息循环最终一定会建立的时候,可以这样发送消息:

```C++
while (!PostThreadMessage(id, 消息, 0, 0));
```

## 定义消息

如果你希望让你的线程退出,只需要发送一个 `WM_QUIT` 消息就可以了。例如:

```C++
PostThreadMessage(id, WM_QUIT, 0, 0);
```

你可以向线程发送 Windows 定义的消息,也可以发送自定义的消息。可以看到,消息的变量类型是 `UINT`,也就是说,任何正整数都可以当成消息发送。

Windows 预定义了很多消息(1024 条消息以内),如果你希望向线程发消息的话,使用的数字要大于 1024。微软将 `WM_USER` 定义成了 1024,所以,你要定义自己的消息的话,可以这么定义:

```C++
#define WM_MY_MESSAGE1 (WM_USER + 1)
#define WM_MY_MESSAGE2 (WM_USER + 2)
```


下面给出一个例子,这个例子要实现:
1. 建立一个线程,线程建立一个消息循环;
2. 主线程给新建线程发送自定义的打印 Hello world 消息之后,新建线程输出 Hello world。

```C++

/* Custom thread message that asks the worker thread to print a greeting. */
#define WM_HELLO_WORLD (WM_USER + 10086)

/*
 * Thread entry point: creates the thread's message queue and then handles
 * thread messages until WM_QUIT arrives or GetMessage() fails.
 *
 * lpParam is unused. Always returns 0.
 */
DWORD WINAPI fun(LPVOID lpParam)
{
    MSG msg;

    (void)lpParam;

    /* Calling PeekMessage makes the system create this thread's message queue;
     * PM_NOREMOVE leaves any pending message in the queue. */
    PeekMessage(&msg, NULL, WM_USER, WM_USER, PM_NOREMOVE);

    /* GetMessage returns 0 for WM_QUIT and -1 on error; testing "> 0" ends
     * the loop in both cases instead of spinning forever on an error. */
    while (GetMessage(&msg, NULL, 0, 0) > 0)
    {
        switch (msg.message)
        {
        case WM_HELLO_WORLD:
            printf("Hello world!");
            break;
        default:
            /* Other thread messages are ignored. */
            break;
        }
    }

    return 0;
}

int main()
{
DWORD idThread;
HANDLE hThread = CreateThread(NULL, 0, fun, NULL, NULL, &idThread);

while(!PostThreadMessage(idThread, WM_HELLO_WORLD, 0, 0))
{
    Sleep(100);
}

PostThreadMessage(idThread, WM_QUIT, 0, 0);
WaitForSingleObject(hThread, INFINITE);

return 0;

}