分类
未分类

macvtap 网卡

  • macvtap设备和macvlan设备的区别
    macvtap设备是基于macvlan设备的。即每次创建一个macvtap设备必定会创建一个macvlan设备
    macvtap设备和macvlan设备最大的区别是数据包的来源和去向
    macvtap设备会关联一个字符设备,通过macvlan设备收上来的数据包直接扔到 字符设备的缓冲区里面,等待字符设备读取。字符设备发送下来的包直接通过关联的macvlan设备发送出去。
    macvtap 设备收到的数据包直接上送到用户态,用户态程序通过macvtap设备关联的字符设备直接发送数据包。
    macvlan 设备收到的数据包上送到协议栈处理。然后协议栈通过发包接口调用macvlan设备发送数据包
  • macvtap设备和tun/tap设备的区别
    macvtap设备是将macvlan网卡设备收到的包通过字符设备上送到用户态,用户态写字符设备通过macvtap设备将数据从关联的macvlan设备发送出去。
    tun/tap设备:协议栈经由tun/tap设备的虚拟网卡发送的数据包,会通过字符设备上送到用户态;用户态写入字符设备的数据包,则经由tun/tap设备的虚拟网卡上送到协议栈。
    总结起来就是:tun/tap设备将虚拟网卡发送的数据包上送到用户态,macvtap设备将虚拟macvlan网卡接收的数据包上送到用户态;tun/tap设备将用户态下发的数据包上送到协议栈,macvtap设备将用户态下发的数据包通过虚拟macvlan设备发送出去。

私有数据结构,这个结构会被当做创建的macvlan网卡设备的私有数据。

/*
 * Private data of a macvtap net device: macvtap_newlink() obtains it with
 * netdev_priv(dev).  It bundles the macvlan state with the tap state.
 */
struct macvtap_dev {
        struct macvlan_dev vlan; // macvlan private state (discussed in the article's previous section)
        struct tap_dev    tap;   // macvtap's own (tap-side) private state
};
/*
 * Tap-side state attached to one net device.  It links the net device to
 * the queues created when the character device is opened.
 */
struct tap_dev {
        struct net_device       *dev;   // net device used for transmit; set in macvtap_newlink()
        u16                     flags;
        /* This array tracks active taps. */
        struct tap_queue    __rcu *taps[MAX_TAP_QUEUES];    // receive-side per-file queues
        /* This list tracks all taps (both enabled and disabled) */
        struct list_head        queue_list;
        int                     numvtaps;
        int                     numqueues;
        netdev_features_t       tap_features;   // OR-ed into the feature mask in tap_handle_frame() when IFF_VNET_HDR is set
        int                     minor;          // char-device minor; 0 is treated as "not allocated" by macvtap_device_event()

        /* Callbacks installed by the owning driver (see macvtap_newlink()). */
        void (*update_features)(struct tap_dev *tap, netdev_features_t features);
        void (*count_tx_dropped)(struct tap_dev *tap);
        void (*count_rx_dropped)(struct tap_dev *tap);
};
/*
 * One queue per open of the tap character device (allocated in tap_open()).
 *
 * NOTE(review): tap_open() casts the sk_alloc() result directly to
 * struct tap_queue *, so sk presumably has to stay the first member --
 * confirm before reordering.
 */
struct tap_queue {
        struct sock sk;
        struct socket sock;
        struct socket_wq wq;
        int vnet_hdr_sz;                // size of the virtio-net header expected on this queue
        struct tap_dev __rcu *tap;      // owning tap; read under RCU in tap_get_user()
        struct file *file;
        unsigned int flags;             // IFF_* flags; tap_open() sets IFF_VNET_HDR | IFF_NO_PI | IFF_TAP
        u16 queue_index;
        bool enabled;
        struct list_head next;
        struct skb_array skb_array; // packet buffer filled by tap_handle_frame()
};

模块注册

/*
 * Module initialisation for macvtap.
 *
 * Performs four registrations in order: the tap character device region
 * (major/minor numbers), the macvtap device class, the net-device notifier
 * and the macvtap link ops used to create macvtap devices.  If a step fails,
 * the steps that already succeeded are undone in reverse order.
 *
 * Returns 0 on success, otherwise the error code of the failing step.
 */
static int macvtap_init(void)
{
        int ret;

        /* Nothing to unwind yet if the very first step fails. */
        ret = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
        if (ret)
                return ret;

        ret = class_register(&macvtap_class);
        if (ret)
                goto undo_cdev;

        /* Notifier that delivers net-device events to macvtap. */
        ret = register_netdevice_notifier(&macvtap_notifier_block);
        if (ret)
                goto undo_class;

        ret = macvlan_link_register(&macvtap_link_ops);
        if (ret)
                goto undo_notifier;

        return 0;

undo_notifier:
        unregister_netdevice_notifier(&macvtap_notifier_block);
undo_class:
        class_unregister(&macvtap_class);
undo_cdev:
        tap_destroy_cdev(macvtap_major, &macvtap_cdev);
        return ret;
}

macvtap设备创建

/*
 * Setup callback for a macvtap net device: run the common macvlan setup,
 * then set the transmit queue length to TUN_READQ_SIZE.
 */
static void macvtap_setup(struct net_device *dev)
{
        macvlan_common_setup(dev); // reuse macvlan's net-device initialisation
        /* tap_open() sizes each queue's skb_array from dev->tx_queue_len. */
        dev->tx_queue_len = TUN_READQ_SIZE;
}
static int macvtap_newlink(struct net *src_net, struct net_device *dev,
                           struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        struct macvtap_dev *vlantap = netdev_priv(dev);
        int err;

        INIT_LIST_HEAD(&vlantap->tap.queue_list);

        /* Since macvlan supports all offloads by default, make
         * tap support all offloads also.
         */
        vlantap->tap.tap_features = TUN_OFFLOADS;

        /* Register callbacks for rx/tx drops accounting and updating
         * net_device features
         */
        vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
        vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
        vlantap->tap.update_features  = macvtap_update_features;

        err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);//macvtap设备rx_handler函数
        if (err)
                return err;

        /* Don't put anything that may fail after macvlan_common_newlink
         * because we can't undo what it does.
         */
        err = macvlan_common_newlink(src_net, dev, tb, data);//调用macvlan注册新网卡设备
        if (err) {
                netdev_rx_handler_unregister(dev);
                return err;
        }

        vlantap->tap.dev = vlantap->vlan.dev;

        return 0;
}

网卡注册时会触发网卡注册事件(NETDEV_REGISTER),最终回调macvtap_device_event函数。

/*
 * Net-device notifier callback for macvtap.
 *
 * Only devices whose rtnl_link_ops is macvtap_link_ops are handled:
 *  - NETDEV_REGISTER: allocate a minor, create the "tap<ifindex>" device in
 *    macvtap_class and link it under the net device's sysfs directory.
 *  - NETDEV_UNREGISTER: remove the link, destroy the device and release the
 *    minor (skipped when no minor was allocated).
 *  - NETDEV_CHANGE_TX_QUEUE_LEN: resize the tap queues.
 *
 * Returns NOTIFY_DONE normally, notifier_from_errno() of the failing step
 * during registration, or NOTIFY_BAD when the queue resize fails.
 */
static int macvtap_device_event(struct notifier_block *unused,
                                unsigned long event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
        struct macvtap_dev *priv;
        struct device *node;
        char link_name[IFNAMSIZ];
        dev_t devno;
        int rc;

        /* Events for devices that are not macvtap links are not ours. */
        if (netdev->rtnl_link_ops != &macvtap_link_ops)
                return NOTIFY_DONE;

        snprintf(link_name, IFNAMSIZ, "tap%d", netdev->ifindex);
        priv = netdev_priv(netdev);

        if (event == NETDEV_REGISTER) {
                /* The device node is created here: after the network device
                 * has been registered, but before register_netdevice has
                 * finished running.
                 */
                rc = tap_get_minor(macvtap_major, &priv->tap);
                if (rc)
                        return notifier_from_errno(rc);

                devno = MKDEV(MAJOR(macvtap_major), priv->tap.minor);
                node = device_create(&macvtap_class, &netdev->dev, devno,
                                     netdev, link_name);
                if (IS_ERR(node)) {
                        tap_free_minor(macvtap_major, &priv->tap);
                        return notifier_from_errno(PTR_ERR(node));
                }

                rc = sysfs_create_link(&netdev->dev.kobj, &node->kobj,
                                       link_name);
                if (rc)
                        return notifier_from_errno(rc);
        } else if (event == NETDEV_UNREGISTER) {
                /* tap.minor stays 0 when the NETDEV_REGISTER step above
                 * failed, in which case there is nothing to tear down.
                 */
                if (priv->tap.minor != 0) {
                        sysfs_remove_link(&netdev->dev.kobj, link_name);
                        devno = MKDEV(MAJOR(macvtap_major), priv->tap.minor);
                        device_destroy(&macvtap_class, devno);
                        tap_free_minor(macvtap_major, &priv->tap);
                }
        } else if (event == NETDEV_CHANGE_TX_QUEUE_LEN) {
                if (tap_queue_resize(&priv->tap))
                        return NOTIFY_BAD;
        }

        return NOTIFY_DONE;
}

打开字符设备,用于用户态通讯

/*
 * open() handler of the tap character device.
 *
 * Finds the tap that belongs to the inode's major/minor, allocates a new
 * tap_queue (one per open), initialises its embedded socket and packet
 * buffer, and attaches the queue to the tap.  Runs under rtnl_lock().
 *
 * Returns the result of tap_set_queue() on the success path, -ENODEV when
 * no tap matches the device numbers, -ENOMEM when the queue or its
 * skb_array cannot be allocated, or the error from tap_set_queue().
 */
static int tap_open(struct inode *inode, struct file *file)
{
        struct net *net = current->nsproxy->net_ns;
        struct tap_dev *tap;
        struct tap_queue *q;
        int err = -ENODEV;

        rtnl_lock();
        tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); // look up the tap by major/minor number
        if (!tap)
                goto err;

        err = -ENOMEM;
        q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                             &tap_proto, 0); // allocate a tap_queue: every open of the char device adds one queue
        if (!q)
                goto err;

        /* Wire up the socket embedded in the queue. */
        RCU_INIT_POINTER(q->sock.wq, &q->wq);
        init_waitqueue_head(&q->wq.wait);
        q->sock.type = SOCK_RAW;
        q->sock.state = SS_CONNECTED;
        q->sock.file = file;
        q->sock.ops = &tap_socket_ops;
        sock_init_data(&q->sock, &q->sk);
        q->sk.sk_write_space = tap_sock_write_space;
        q->sk.sk_destruct = tap_sock_destruct;
        q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
        q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

        /*
         * so far only KVM virtio_net uses tap, enable zero copy between
         * guest kernel and host kernel when lower device supports zerocopy
         *
         * The macvlan supports zerocopy iff the lower device supports zero
         * copy so we don't have to look at the lower device directly.
         */
        if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
                sock_set_flag(&q->sk, SOCK_ZEROCOPY);

        /* The packet buffer is sized from the device's tx_queue_len. */
        err = -ENOMEM;
        if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
                goto err_array;

        err = tap_set_queue(tap, file, q); // associate the queue with the tap device
        if (err)
                goto err_queue;

        /* NOTE(review): presumably drops a reference taken by
         * dev_get_by_tap_file() -- confirm against that helper.
         */
        dev_put(tap->dev);

        rtnl_unlock();
        return err;

err_queue:
        skb_array_cleanup(&q->skb_array);
err_array:
        sock_put(&q->sk);
err:
        /* tap is NULL when the lookup itself failed. */
        if (tap)
                dev_put(tap->dev);

        rtnl_unlock();
        return err;
}

至此,数据通路已经完全打开。
接收数据流程
在netif_receive_skb函数中调用 tap_handle_frame函数

/*
 * rx handler installed on the tap net device (macvtap_newlink() registers
 * it through netdev_rx_handler_register()).
 *
 * Picks a queue for the received skb and places the skb -- or, when GSO
 * segmentation is needed, its segments -- into that queue's skb_array, then
 * wakes up readers waiting on the queue's socket.
 *
 * Returns RX_HANDLER_PASS when the device has no tap or no queue is
 * available; otherwise RX_HANDLER_CONSUMED, whether the skb was queued or
 * dropped.
 */
rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct net_device *dev = skb->dev;
        struct tap_dev *tap;
        struct tap_queue *q;
        netdev_features_t features = TAP_FEATURES;

        tap = tap_dev_get_rcu(dev); // fetch the tap structure associated with the device
        if (!tap)
                return RX_HANDLER_PASS;

        q = tap_get_queue(tap, skb); // choose the buffer queue (hash-based, per the article)
        if (!q)
                return RX_HANDLER_PASS;

        /* Drop early when the queue cannot take another entry. */
        if (__skb_array_full(&q->skb_array))
                goto drop;

        /* Make the ETH_HLEN-byte Ethernet header part of the skb data again. */
        skb_push(skb, ETH_HLEN);

        /* Apply the forward feature mask so that we perform segmentation
         * according to users wishes.  This only works if VNET_HDR is
         * enabled.
         */
        if (q->flags & IFF_VNET_HDR)
                features |= tap->tap_features;
        if (netif_needs_gso(skb, features)) {
                struct sk_buff *segs = __skb_gso_segment(skb, features, false);

                if (IS_ERR(segs))
                        goto drop;

                /* No segment list returned: queue the original skb as is. */
                if (!segs) {
                        if (skb_array_produce(&q->skb_array, skb)) // put the packet into the buffer queue
                                goto drop;
                        goto wake_up;
                }

                /* The segments replace the original skb. */
                consume_skb(skb);
                while (segs) {
                        struct sk_buff *nskb = segs->next;

                        segs->next = NULL;
                        /* On failure free this segment and all remaining ones;
                         * readers are still woken for what was queued before.
                         */
                        if (skb_array_produce(&q->skb_array, segs)) {
                                kfree_skb(segs);
                                kfree_skb_list(nskb);
                                break;
                        }
                        segs = nskb;
                }
        } else {
                /* If we receive a partial checksum and the tap side
                 * doesn't support checksum offload, compute the checksum.
                 * Note: it doesn't matter which checksum feature to
                 *        check, we either support them all or none.
                 */
                if (skb->ip_summed == CHECKSUM_PARTIAL &&
                    !(features & NETIF_F_CSUM_MASK) &&
                    skb_checksum_help(skb))
                        goto drop;
                if (skb_array_produce(&q->skb_array, skb)) // put the packet into the buffer queue
                        goto drop;
        }

wake_up:
        wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND); // wake up processes waiting to read
        return RX_HANDLER_CONSUMED;

drop:
        /* Count errors/drops only here, thus don't care about args. */
        if (tap->count_rx_dropped)
                tap->count_rx_dropped(tap);
        kfree_skb(skb);
        return RX_HANDLER_CONSUMED;
}

用户态调用tap_read_iter读取数据,tap_do_read 函数跟 tun/tap中函数类似,都是判断缓冲区是否有数据,如果没有数据就等待,如果有数据返回数据

/*
 * read_iter handler of the tap character device.
 *
 * Delegates to tap_do_read() to fetch data from the queue attached to the
 * file, honouring O_NONBLOCK.  The result is capped at the number of bytes
 * the caller's iterator could hold on entry, and a positive result is also
 * stored in iocb->ki_pos.
 *
 * Returns the (capped) value from tap_do_read(); values <= 0 are passed
 * through unchanged.
 */
static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *filp = iocb->ki_filp;
        struct tap_queue *queue = filp->private_data;
        /* Captured before tap_do_read() gets to work on the iterator. */
        const ssize_t requested = iov_iter_count(to);
        ssize_t copied;

        copied = tap_do_read(queue, to, filp->f_flags & O_NONBLOCK, NULL);
        if (copied > requested)
                copied = requested;

        if (copied > 0)
                iocb->ki_pos = copied;

        return copied;
}

发送数据流程

/*
 * write_iter handler of the tap character device.
 *
 * Passes the user's data to tap_get_user() for the queue attached to the
 * file.  No msghdr is supplied (NULL), and the O_NONBLOCK file flag is
 * forwarded as the noblock argument.
 *
 * Returns whatever tap_get_user() returns.
 */
static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct tap_queue *q = file->private_data;

        /* Send the data. */
        return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
}
/*
 * Build an skb from user-supplied data and transmit it on the tap's device.
 *
 * @q:       queue the data was written to
 * @m:       optional msghdr; when it carries msg_control and the socket has
 *           SOCK_ZEROCOPY set, a zerocopy transfer is attempted
 * @from:    iterator over the user data (virtio-net header first when the
 *           queue has IFF_VNET_HDR set)
 * @noblock: passed through to tap_alloc_skb()
 *
 * Returns the total number of bytes in @from on entry when an skb was built
 * (even if the queue has no tap and the skb is freed), or a negative error
 * code.  On error the tap's count_tx_dropped callback is invoked if present.
 */
static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
                            struct iov_iter *from, int noblock)
{
        int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
        struct sk_buff *skb;
        struct tap_dev *tap;
        unsigned long total_len = iov_iter_count(from);
        unsigned long len = total_len;
        int err;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int vnet_hdr_len = 0;
        int copylen = 0;
        int depth;
        bool zerocopy = false;
        size_t linear;

        /* Strip and validate the virtio-net header that precedes the frame. */
        if (q->flags & IFF_VNET_HDR) {
                vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto err;
                len -= vnet_hdr_len;

                err = -EFAULT;
                if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
                        goto err;
                /* Skip any header bytes beyond sizeof(vnet_hdr). */
                iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
                /* Raise hdr_len so it covers csum_start + csum_offset + 2
                 * when a checksum is requested.
                 */
                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                     tap16_to_cpu(q, vnet_hdr.csum_start) +
                     tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
                             tap16_to_cpu(q, vnet_hdr.hdr_len))
                        vnet_hdr.hdr_len = cpu_to_tap16(q,
                                 tap16_to_cpu(q, vnet_hdr.csum_start) +
                                 tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
                err = -EINVAL;
                if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
                        goto err;
        }

        /* The payload must hold at least an Ethernet header. */
        err = -EINVAL;
        if (unlikely(len < ETH_HLEN))
                goto err;

        /* Zerocopy is used only when the remainder after the copied part
         * fits into MAX_SKB_FRAGS pages.
         */
        if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
                struct iov_iter i;

                copylen = vnet_hdr.hdr_len ?
                        tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
                if (copylen > good_linear)
                        copylen = good_linear;
                else if (copylen < ETH_HLEN)
                        copylen = ETH_HLEN;
                linear = copylen;
                i = *from;
                iov_iter_advance(&i, copylen);
                if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
                        zerocopy = true;
        }

        /* Copy path: copy everything; the linear size is hdr_len clamped to
         * [ETH_HLEN, good_linear].
         */
        if (!zerocopy) {
                copylen = len;
                linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
                if (linear > good_linear)
                        linear = good_linear;
                else if (linear < ETH_HLEN)
                        linear = ETH_HLEN;
        }

        skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
                            linear, noblock, &err); // allocate the skb
        if (!skb)
                goto err;

        if (zerocopy)
                err = zerocopy_sg_from_iter(skb, from);
        else
                err = skb_copy_datagram_from_iter(skb, 0, from, len); // copy the data into the skb

        if (err)
                goto err_kfree;

        skb_set_network_header(skb, ETH_HLEN);
        skb_reset_mac_header(skb); // set the MAC header offset
        skb->protocol = eth_hdr(skb)->h_proto;

        if (vnet_hdr_len) {
                err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
                                            tap_is_little_endian(q));
                if (err)
                        goto err_kfree;
        }

        skb_probe_transport_header(skb, ETH_HLEN);

        /* Move network header to the right position for VLAN tagged packets */
        if ((skb->protocol == htons(ETH_P_8021Q) ||
             skb->protocol == htons(ETH_P_8021AD)) &&
            __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
                skb_set_network_header(skb, depth);

        rcu_read_lock();
        tap = rcu_dereference(q->tap);
        /* copy skb_ubuf_info for callback when skb has no error */
        if (zerocopy) {
                skb_shinfo(skb)->destructor_arg = m->msg_control;
                skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
                skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
        } else if (m && m->msg_control) {
                /* Zerocopy was offered but not used: invoke the callback now. */
                struct ubuf_info *uarg = m->msg_control;
                uarg->callback(uarg, false);
        }

        if (tap) {
                skb->dev = tap->dev; // select the net device used for transmit
                dev_queue_xmit(skb); // transmit the packet; per the article this ends in macvlan's xmit path
        } else {
                /* The queue has no tap: free the skb. */
                kfree_skb(skb);
        }
        rcu_read_unlock();

        return total_len;

err_kfree:
        kfree_skb(skb);

err:
        /* Account the failed transmit if the tap still exists. */
        rcu_read_lock();
        tap = rcu_dereference(q->tap);
        if (tap && tap->count_tx_dropped)
                tap->count_tx_dropped(tap);
        rcu_read_unlock();

        return err;
}

至此, macvtap数据发送流程结束。

macvtap在VPN和虚拟化中应用很广。它的主要功能是将macvlan设备收到的数据包直接上送到用户态,而不是交给协议栈处理。

发表评论

电子邮件地址不会被公开。 必填项已用*标注