分类
linux virtual nic 未分类

macvlan 虚拟网卡

macvlan 虚拟网卡设备
macvlan虚拟网卡设备是寄生在物理网卡设备上的。发包时调用自己的发包函数,查找到所寄生的物理设备,然后通过物理设备发包。收包时,通过在所寄生的物理设备上注册的rx_handler回调函数处理数据包。
macvlan 虚拟网卡设备包括5种模式
private 模式:在这种模式下,macvlan设备不能接收寄生在同一个物理网卡上的其他macvlan设备的数据包,即使是其他macvlan设备通过物理网卡发送出去并通过hairpin设备返回的包也不接收
vepa 模式:在这种模式下,macvlan设备不能直接接收寄生在同一个物理网卡上的其他macvlan设备的数据包,但是其他macvlan设备可以将数据包通过物理网卡发送出去,然后通过hairpin设备返回给其他macvlan设备
passthru 模式:在这种模式下,每一个物理设备只能寄生一个macvlan设备
bridge 模式:在这种模式下,寄生在同一个物理设备的macvlan设备可以直接通讯,不需要外接的hairpin设备帮助
source 模式:在这种模式下,寄生在物理设备上的这类macvlan设备,只接收源mac地址在指定列表中的数据包,其他数据包都不接收。

macvlan设备 关键数据结构
macvlan_port,这个数据结构是在注册rx_handler时使用的,作为回调函数的参数。

/*
 * Per-lower-device macvlan state: one macvlan_port exists for a physical
 * device that hosts macvlan devices, and every macvlan_dev created on that
 * device points back to it through macvlan_dev::port.
 */
struct macvlan_port {
        struct net_device       *dev;                           // the lower (physical) device
        struct hlist_head       vlan_hash[MACVLAN_HASH_SIZE];   // hash buckets of macvlan devices (their private data)
        struct list_head        vlans;                          // list of macvlan devices; entries are linked through macvlan_dev::list
        struct sk_buff_head     bc_queue;                       // queue of broadcast/multicast frames awaiting delivery
        struct work_struct      bc_work;                        // work item that drains bc_queue (macvlan_process_broadcast)
        u32                     flags;                          // port flags
        int                     count;                          // number of macvlan devices on this port
        struct hlist_head       vlan_source_hash[MACVLAN_HASH_SIZE];    // used only by source-mode macvlan devices
        DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);        // bitmap tested with mc_hash() on receive before queueing a multicast frame
        unsigned char           perm_addr[ETH_ALEN];
};

macvlan_dev,这个数据结构是macvlan网卡的私有数据结构,每创建一个macvlan设备就会创建一个这样的结构,并将它挂在 macvlan_port 数据结构上。

/*
 * Private data of one macvlan network device (obtained with netdev_priv()).
 * Each instance is attached to the macvlan_port of its lower device.
 */
struct macvlan_dev {
        struct net_device       *dev;           // the macvlan network device itself
        struct list_head        list;           // link in macvlan_port::vlans
        struct hlist_node       hlist;          // link in the port's macvlan hash table
        struct macvlan_port     *port;          // owning macvlan_port
        struct net_device       *lowerdev;      // the lower (physical) device this macvlan sits on
        void                    *fwd_priv;      // hardware-offload handle from ndo_dfwd_add_station(), NULL when not offloaded
        struct vlan_pcpu_stats __percpu *pcpu_stats;    // per-CPU tx/rx statistics

        DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);

        netdev_features_t       set_features;
        enum macvlan_mode       mode;           // private / vepa / bridge / passthru / source
        u16                     flags;          // e.g. MACVLAN_FLAG_NOPROMISC
        /* This array tracks active taps. */
        struct tap_queue        __rcu *taps[MAX_TAP_QUEUES];
        /* This list tracks all taps (both enabled and disabled) */
        struct list_head        queue_list;
        int                     numvtaps;
        int                     numqueues;
        netdev_features_t       tap_features;
        int                     minor;
        int                     nest_level;     // set to the lower device's nest level + 1 at creation
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll          *netpoll;
#endif
        unsigned int            macaddr_count;
};

两个私有结构之间的关系

// Relationship between struct macvlan_port and struct macvlan_dev:
// both hash tables of the port reference macvlan_dev entries.
digraph tun{
    node [shape = plaintext]
    rankdir = LR
    macvlan_port[label=<
            <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
            <TR><TD BORDER="0" ALIGN="CENTER">struct macvlan_port</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f0">struct net_device *dev</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f1">struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]</TD></TR>
            <TR><TD ALIGN="CENTER">struct list_head vlans</TD></TR>
            <TR><TD ALIGN="LEFT">struct sk_buff_head bc_queue</TD></TR>
            <TR><TD ALIGN="CENTER">struct work_struct bc_work</TD></TR>
            <TR><TD ALIGN="LEFT">u32 flags</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f2">struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            </TABLE>>]
    macvlan_dev[label=<
            <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
            <TR><TD BORDER="0" ALIGN="CENTER">struct macvlan_dev</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f0">struct net_device *dev</TD></TR>
            <TR><TD ALIGN="LEFT">struct list_head list</TD></TR>
            <TR><TD ALIGN="LEFT">struct hlist_node hlist</TD></TR>
            <TR><TD ALIGN="LEFT">struct macvlan_port *port</TD></TR>
            <TR><TD ALIGN="LEFT">struct net_device *lowerdev</TD></TR>
            <TR><TD ALIGN="LEFT">void *fwd_priv</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            </TABLE>>]
    macvlan_port:f1 -> macvlan_dev:f0
    macvlan_port:f2 -> macvlan_dev:f0
}

模块注册

/*
 * Module entry point.
 *
 * Subscribes to net-device notifications and registers the macvlan netlink
 * link operations.  Returns 0 on success; if registering the link operations
 * fails, the notifier is unregistered again and the negative error code is
 * returned.
 */
static int __init macvlan_init_module(void)
{
        int rc;

        /* Subscribe to net-device events. */
        register_netdevice_notifier(&macvlan_notifier_block);

        /* Register the netlink operations. */
        rc = macvlan_link_register(&macvlan_link_ops);
        if (rc >= 0)
                return 0;

        /* Roll back the notifier subscription before reporting the failure. */
        unregister_netdevice_notifier(&macvlan_notifier_block);
        return rc;
}

创建macvlan设备

/*
 * Common "newlink" handler: set up the macvlan device @dev on top of the
 * lower device named by tb[IFLA_LINK] (looked up in @src_net), register it
 * and add it to the lower device's macvlan_port.
 *
 * @tb carries the generic link attributes (IFLA_LINK, IFLA_MTU,
 * IFLA_ADDRESS); @data, which may be NULL, carries the macvlan-specific
 * ones (mode, flags, source MAC list).
 *
 * Returns 0 on success, -EINVAL for a missing/invalid attribute or a
 * passthru conflict, -ENODEV when the lower device does not exist, or the
 * error reported by one of the helpers.  If the port was created by this
 * call it is destroyed again on failure.
 */
int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
                           struct nlattr *tb[], struct nlattr *data[])
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct macvlan_port *port;
        struct net_device *lowerdev;
        int err;
        int macmode;
        bool create = false;    // true when this call created the port

        if (!tb[IFLA_LINK])
                return -EINVAL;

        lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
        if (lowerdev == NULL)
                return -ENODEV;

        /* When creating macvlans or macvtaps on top of other macvlans - use
         * the real device as the lowerdev.
         */
        if (netif_is_macvlan(lowerdev))
                lowerdev = macvlan_dev_real_dev(lowerdev);  // resolve to the underlying physical device

        /* Inherit the lower device's MTU unless one was given; a requested
         * MTU larger than the lower device's is rejected.
         */
        if (!tb[IFLA_MTU])
                dev->mtu = lowerdev->mtu;
        else if (dev->mtu > lowerdev->mtu)
                return -EINVAL;

        /* MTU range: 68 - lowerdev->max_mtu */
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = lowerdev->max_mtu;

        /* No address supplied: pick a random MAC address. */
        if (!tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        if (!macvlan_port_exists(lowerdev)) {
                err = macvlan_port_create(lowerdev);    // lower device has no macvlan port yet: create it (per the article this also registers the rx_handler)
                if (err < 0)
                        return err;
                create = true;
        }
        port = macvlan_port_get_rtnl(lowerdev);

        /* Only 1 macvlan device can be created in passthru mode */
        if (macvlan_passthru(port)) {   // port already holds a passthru macvlan: refuse
                /* The macvlan port must be not created this time,
                 * still goto destroy_macvlan_port for readability.
                 */
                err = -EINVAL;
                goto destroy_macvlan_port;
        }

        vlan->lowerdev = lowerdev;
        vlan->dev      = dev;
        vlan->port     = port;
        vlan->set_features = MACVLAN_FEATURES;
        vlan->nest_level = dev_get_nest_level(lowerdev) + 1;

        /* VEPA is the default mode when none is requested. */
        vlan->mode     = MACVLAN_MODE_VEPA;
        if (data && data[IFLA_MACVLAN_MODE])
                vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);

        if (data && data[IFLA_MACVLAN_FLAGS])
                vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);

        if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
                if (port->count) {          // a passthru macvlan cannot be added once the lower device already hosts macvlan devices
                        err = -EINVAL;
                        goto destroy_macvlan_port;
                }
                macvlan_set_passthru(port);
                eth_hw_addr_inherit(dev, lowerdev);     // passthru device takes over the lower device's MAC address
        }

        if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
                /* A source MAC list is only valid for source mode. */
                if (vlan->mode != MACVLAN_MODE_SOURCE) {
                        err = -EINVAL;
                        goto destroy_macvlan_port;
                }
                macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]);
                err = macvlan_changelink_sources(vlan, macmode, data);  // source mode: apply the list of source MAC addresses this device accepts
                if (err)
                        goto destroy_macvlan_port;
        }

        err = register_netdevice(dev);
        if (err < 0)
                goto destroy_macvlan_port;

        dev->priv_flags |= IFF_MACVLAN;
        err = netdev_upper_dev_link(lowerdev, dev);
        if (err)
                goto unregister_netdev;

        list_add_tail_rcu(&vlan->list, &port->vlans);   // add the new macvlan to the port's device list
        netif_stacked_transfer_operstate(lowerdev, dev);
        linkwatch_fire_event(dev);

        return 0;

unregister_netdev:
        unregister_netdevice(dev);
destroy_macvlan_port:
        /* Only tear the port down if this call created it. */
        if (create)
                macvlan_port_destroy(port->dev);
        return err;
}

收包逻辑

/*
 * rx_handler installed on the lower device: decides, for every frame the
 * lower device receives, whether it belongs to one of the macvlan devices
 * stacked on it.
 *
 * @pskb: in/out pointer to the frame; updated when the skb is replaced.
 *
 * Returns RX_HANDLER_PASS to let the lower device process the frame,
 * RX_HANDLER_ANOTHER after retargeting the skb to a macvlan device, or
 * RX_HANDLER_CONSUMED when the frame was delivered or dropped here.
 */
static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
{
        struct macvlan_port *port;
        struct sk_buff *skb = *pskb;
        const struct ethhdr *eth = eth_hdr(skb);
        const struct macvlan_dev *vlan;
        const struct macvlan_dev *src;
        struct net_device *dev;
        unsigned int len = 0;
        int ret;
        rx_handler_result_t handle_res;

        port = macvlan_port_get_rcu(skb->dev);
        if (is_multicast_ether_addr(eth->h_dest)) { // multicast (including broadcast) destination
                unsigned int hash;

                skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN);
                if (!skb)
                        return RX_HANDLER_CONSUMED;
                *pskb = skb;
                eth = eth_hdr(skb);     // re-read the header: the skb may have changed
                macvlan_forward_source(skb, port, eth->h_source);// receive path for source-mode macvlan devices
                src = macvlan_hash_lookup(port, eth->h_source);// is the sender a macvlan on this same lower device?
                if (src && src->mode != MACVLAN_MODE_VEPA &&
                    src->mode != MACVLAN_MODE_BRIDGE) { // sender is a local macvlan whose mode is neither VEPA nor BRIDGE: deliver only back to that device
                        /* forward to original port. */
                        vlan = src;
                        ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:
                              netif_rx(skb);
                        handle_res = RX_HANDLER_CONSUMED;
                        goto out;
                }

                hash = mc_hash(NULL, eth->h_dest);
                if (test_bit(hash, port->mc_filter))
                        macvlan_broadcast_enqueue(port, src, skb);  // queue the frame; the work item delivers it to the macvlan devices

                return RX_HANDLER_PASS; // the lower device also processes the multicast frame
        }

        macvlan_forward_source(skb, port, eth->h_source); // unicast: receive path for source-mode devices
        if (macvlan_passthru(port))
                vlan = list_first_or_null_rcu(&port->vlans,
                                              struct macvlan_dev, list);
        else
                vlan = macvlan_hash_lookup(port, eth->h_dest); // find the macvlan owning the destination MAC
        if (vlan == NULL)
                return RX_HANDLER_PASS; // no match: continue with the lower device's normal processing

        dev = vlan->dev;
        if (unlikely(!(dev->flags & IFF_UP))) {
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED; // target macvlan exists but is down: drop the frame
        }
        len = skb->len + ETH_HLEN;
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb) {
                ret = NET_RX_DROP;
                handle_res = RX_HANDLER_CONSUMED;
                goto out;
        }

        *pskb = skb;
        skb->dev = dev;
        skb->pkt_type = PACKET_HOST; // target found and up: mark the frame as addressed to this host

        ret = NET_RX_SUCCESS;
        handle_res = RX_HANDLER_ANOTHER;    // rx processing continues with the macvlan device
out:
        /* Account the frame on the macvlan's rx statistics. */
        macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
        return handle_res;
}

发送工作队列,工作函数

static void macvlan_process_broadcast(struct work_struct *w)
{       
        struct macvlan_port *port = container_of(w, struct macvlan_port,
                                                 bc_work);
        struct sk_buff *skb;
        struct sk_buff_head list;

        __skb_queue_head_init(&list);

        spin_lock_bh(&port->bc_queue.lock);
        skb_queue_splice_tail_init(&port->bc_queue, &list);
        spin_unlock_bh(&port->bc_queue.lock);

        while ((skb = __skb_dequeue(&list))) {
                const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;

                rcu_read_lock();

                if (!src) //如果不是寄生在同一个 物理设备上的macvlan发送,则所有macvlan都 收包
                        /* frame comes from an external address */
                        macvlan_broadcast(skb, port, NULL,
                                          MACVLAN_MODE_PRIVATE |
                                          MACVLAN_MODE_VEPA    |
                                          MACVLAN_MODE_PASSTHRU|
                                          MACVLAN_MODE_BRIDGE);
                else if (src->mode == MACVLAN_MODE_VEPA)//如果是寄生的同一个物理设备上的VEPA类型macvlan发送,则VEPA类型和BRIDGE类型macvlan都受到,private不收
                        /* flood to everyone except source */
                        macvlan_broadcast(skb, port, src->dev,
                                          MACVLAN_MODE_VEPA |
                                          MACVLAN_MODE_BRIDGE);
                else    //如果是寄生在同一个物理设备上的BRIDGE类型macvlan发送,则VEPA类型macvlan受到,private 和 bridge类型不收, bridge不收是因为在发送的时候已经上送给它了。
                        /*
                         * flood only to VEPA ports, bridge ports
                         * already saw the frame on the way out.
                         */
                        macvlan_broadcast(skb, port, src->dev,
                                          MACVLAN_MODE_VEPA);

                rcu_read_unlock();

                if (src)
                        dev_put(src->dev);
                kfree_skb(skb);
        }
}

通过控制广播报文的收发,macvlan保证寄生在同一个物理设备上的private设备不能相互收到包:arp协议基于广播报文,private设备收不到彼此的arp请求,也就无法解析对方的mac地址。

发包流程

/*
 * Transmit entry point of a macvlan device.
 *
 * Sends @skb through the hardware-offload path when the device has an
 * offload handle (fwd_priv), otherwise through macvlan_queue_xmit(), and
 * then updates the per-CPU tx counters.  While netpoll is transmitting on
 * @dev the frame is handed to macvlan_netpoll_send_skb() instead.
 *
 * Returns the result of the transmit call that was used.
 */
static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        /* Sample the length before the skb is handed to the transmit path. */
        unsigned int len = skb->len;
        struct vlan_pcpu_stats *pcpu_stats;
        int ret;

        if (unlikely(netpoll_tx_running(dev)))
                return macvlan_netpoll_send_skb(vlan, skb);

        if (!vlan->fwd_priv) {
                /* No hardware offload: regular software path. */
                ret = macvlan_queue_xmit(skb, dev);
        } else {
                /* Hardware-accelerated transmit on the lower device. */
                skb->dev = vlan->lowerdev;
                ret = dev_queue_xmit_accel(skb, vlan->fwd_priv);
        }

        /* Anything other than success or congestion notice counts as a drop. */
        if (unlikely(ret != NET_XMIT_SUCCESS && ret != NET_XMIT_CN)) {
                this_cpu_inc(vlan->pcpu_stats->tx_dropped);
                return ret;
        }

        pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
        u64_stats_update_begin(&pcpu_stats->syncp);
        pcpu_stats->tx_packets++;
        pcpu_stats->tx_bytes += len;
        u64_stats_update_end(&pcpu_stats->syncp);

        return ret;
}

macvlan_queue_xmit函数

/*
 * Software transmit path of a macvlan device.
 *
 * In bridge mode, multicast frames are first handed to macvlan_broadcast()
 * for the other bridge-mode devices, and unicast frames addressed to
 * another bridge-mode macvlan on the same port are handed to
 * dev_forward_skb() on the lower device instead of being transmitted.
 * Every other frame is queued for transmission on the lower device.
 *
 * Returns NET_XMIT_SUCCESS for the local bridge shortcut, otherwise the
 * result of dev_queue_xmit().
 */
static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
        const struct macvlan_dev *vlan = netdev_priv(dev);
        const struct macvlan_port *port = vlan->port;

        if (vlan->mode == MACVLAN_MODE_BRIDGE) {
                const struct ethhdr *eth = (void *)skb->data;

                if (is_multicast_ether_addr(eth->h_dest)) {
                        /* send to other bridge ports directly, then fall
                         * through to the normal transmit below
                         */
                        macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
                } else {
                        const struct macvlan_dev *peer =
                                macvlan_hash_lookup(port, eth->h_dest);

                        if (peer && peer->mode == MACVLAN_MODE_BRIDGE) {
                                /* send to lowerdev first for its network taps */
                                dev_forward_skb(vlan->lowerdev, skb);

                                return NET_XMIT_SUCCESS;
                        }
                }
        }

        /* Transmit through the lower (physical) device. */
        skb->dev = vlan->lowerdev;
        return dev_queue_xmit(skb);
}

打开网络设备

/*
 * Open (bring up) a macvlan device.
 *
 * Passthru ports only switch the lower device to promiscuous mode (unless
 * MACVLAN_FLAG_NOPROMISC is set).  Otherwise hardware offload is tried
 * first; if that is unavailable the device's MAC address is added to the
 * lower device and the allmulti/promiscuous settings are propagated.  On
 * the non-offloaded paths the device is finally added to the port's hash
 * table.
 *
 * Returns 0 on success, -EBUSY when the MAC address is already in use on
 * the port, or the error reported by a helper.
 */
static int macvlan_open(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;
        int err;

        if (macvlan_passthru(vlan->port)) {
                if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) {
                        err = dev_set_promiscuity(lowerdev, 1);
                        if (err < 0)
                                goto out;
                }
                goto hash_add;
        }

        if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD &&
            dev->rtnl_link_ops == &macvlan_link_ops) {
                vlan->fwd_priv =
                      lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);

                /* If we get a NULL pointer back, or if we get an error
                 * then we should just fall through to the non accelerated path
                 */
                if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
                        vlan->fwd_priv = NULL;
                } else
                        return 0;
        }   // hardware offload set up when the lower device supports it

        err = -EBUSY;
        if (macvlan_addr_busy(vlan->port, dev->dev_addr))
                goto out;

        /* Add this device's MAC address to the lower device. */
        err = dev_uc_add(lowerdev, dev->dev_addr);
        if (err < 0)
                goto out;
        if (dev->flags & IFF_ALLMULTI) {
                err = dev_set_allmulti(lowerdev, 1);
                if (err < 0)
                        goto del_unicast;
        }

        if (dev->flags & IFF_PROMISC) {
                err = dev_set_promiscuity(lowerdev, 1); // put the lower device into promiscuous receive mode
                if (err < 0)
                        goto clear_multi;
        }

hash_add:
        macvlan_hash_add(vlan); // not offloaded: add the opened macvlan to the port's hash table
        return 0;

        /* Error unwinding: undo the steps above in reverse order. */
clear_multi:
        if (dev->flags & IFF_ALLMULTI)
                dev_set_allmulti(lowerdev, -1);
del_unicast:
        dev_uc_del(lowerdev, dev->dev_addr);
out:
        if (vlan->fwd_priv) {
                lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
                                                           vlan->fwd_priv);
                vlan->fwd_priv = NULL;
        }
        return err;
}

关闭

/*
 * Stop (bring down) a macvlan device.
 *
 * An offloaded device only releases its hardware station.  Otherwise the
 * unicast/multicast address lists are unsynced from the lower device, the
 * promiscuous/allmulti settings applied when the device was opened are
 * reverted, the device's MAC address is removed from the lower device
 * (non-passthru only) and the device is removed from the port's hash table.
 *
 * Always returns 0.
 */
static int macvlan_stop(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;

        if (vlan->fwd_priv) {
                /* Release the hardware-offload station; nothing else to undo. */
                lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
                                                           vlan->fwd_priv);
                vlan->fwd_priv = NULL;
                return 0;
        }

        dev_uc_unsync(lowerdev, dev);
        dev_mc_unsync(lowerdev, dev);

        if (macvlan_passthru(vlan->port)) {
                /* Passthru: only the promiscuity count may need reverting. */
                if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC))
                        dev_set_promiscuity(lowerdev, -1);
        } else {
                if (dev->flags & IFF_ALLMULTI)
                        dev_set_allmulti(lowerdev, -1);

                /* Leave promiscuous receive mode. */
                if (dev->flags & IFF_PROMISC)
                        dev_set_promiscuity(lowerdev, -1);

                /* Remove this device's MAC address from the lower device. */
                dev_uc_del(lowerdev, dev->dev_addr);
        }

        /* Remove the device from the port's hash table. */
        macvlan_hash_del(vlan, !dev->dismantle);
        return 0;
}

macvlan当前有很多应用,在docker和虚拟化中应用很多:在虚拟化中多数使用macvtap设备,在docker中使用macvlan设备。另外,vrrp等一些需要接收发往其他mac地址的数据包的应用也可以使用macvlan设备。

发表评论

电子邮件地址不会被公开。 必填项已用*标注