分类
未分类

ipvlan 虚拟网卡

ipvlan虚拟网卡类似于macvlan虚拟网卡。区别在于:macvlan的每个虚拟网卡都有自己的mac地址,而ipvlan的所有虚拟网卡共用一个mac地址,通过不同的ip地址来区分数据包属于哪个网卡。这有点类似于内核之前的别名接口,但它跟别名接口最大的区别在于:由于存在独立的虚拟网卡设备,可以将不同的虚拟网卡分配到不同的网络命名空间。后续我们可以看到它还能够跟tap设备结合,组成ipvtap虚拟网卡。
类似于macvlan设备,ipvlan设备也有3种模式:
L2模式:在这种模式下,ipvlan虚拟网卡能够收到广播报文,能够自己处理arp请求。
L3模式:在这种模式下,ipvlan虚拟网卡不能够接收二层广播报文,arp请求由主网卡代为处理。
L3S模式:在这种模式下,ipvlan虚拟网卡不能够接收二层广播报文,arp请求由主网卡代为处理。跟L3模式唯一的区别在于,L3S模式只有在报文是发往本地的时候才修改接收数据包的网卡,否则不修改;这种模式没法和tap设备结合。

ipvlan设备的关键数据结构是ipvl_port结构:每个被寄生的物理设备都会有一个这样的结构,寄生在该物理设备上的所有ipvlan设备都会被连接到这个结构中。

/* Per-lower-device state. A port is created the first time an ipvlan slave
 * is attached to a lower (physical) device, and every slave stacked on that
 * device is linked into it.
 */
struct ipvl_port {
        struct net_device       *dev;   /* lower (physical) device hosting the slaves */
        possible_net_t          pnet;   /* network namespace */
        struct hlist_head       hlhead[IPVLAN_HASH_SIZE];   /* hash table used to find the ipvlan device owning an address */
        struct list_head        ipvlans;    /* list of all ipvlan devices on this port */
        u16                     mode;   /* operating mode (IPVLAN_MODE_L2 / L3 / L3S), kept per port */
        u16                     dev_id_start;   /* first dev_id to try when allocating an id for the next slave */
        struct work_struct      wq;     /* work item that processes queued multicast/broadcast frames */
        struct sk_buff_head     backlog;    /* queue of multicast/broadcast frames awaiting that work item */
        int                     count;  /* number of ipvlan devices attached */
        struct ida              ida;    /* allocator for the per-slave dev_id values */
};

每个ipvlan设备私有数据结构

/* Private data of one ipvlan net_device (obtained through netdev_priv()). */
struct ipvl_dev {
        struct net_device       *dev;   /* the ipvlan net_device itself */
        struct list_head        pnode;  /* list node linking this device into port->ipvlans */
        struct ipvl_port        *port;  /* port this device belongs to */
        struct net_device       *phy_dev;   /* lower (physical) device this ipvlan is stacked on */
        struct list_head        addrs;  /* IP addresses assigned to this ipvlan device */
        struct ipvl_pcpu_stats  __percpu *pcpu_stats;   /* per-CPU statistics counters */
        DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
        netdev_features_t       sfeatures;  /* initialised to IPVLAN_FEATURES when the link is created */
        u32                     msg_enable;
};

模块注册

/* Module entry point.
 *
 * Registers the netdevice and IPv4/IPv6 address notifiers, then the
 * per-namespace operations, then the rtnl link operations. If either of the
 * last two steps fails, everything registered so far is torn down and the
 * error code is returned; on success returns 0.
 */
static int __init ipvlan_init_module(void)
{
        int rc;

        ipvlan_init_secret();

        /* Event sources: net_device events plus address add/remove and
         * address validation for both IPv6 and IPv4.
         */
        register_netdevice_notifier(&ipvlan_notifier_block);
        register_inet6addr_notifier(&ipvlan_addr6_notifier_block);
        register_inet6addr_validator_notifier(&ipvlan_addr6_vtor_notifier_block);
        register_inetaddr_notifier(&ipvlan_addr4_notifier_block);
        register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block);

        /* Per-network-namespace init/exit hooks. */
        rc = register_pernet_subsys(&ipvlan_net_ops);
        if (rc < 0)
                goto drop_notifiers;

        /* Link operations used to create ipvlan devices over netlink. */
        rc = ipvlan_link_register(&ipvlan_link_ops);
        if (rc < 0)
                goto drop_pernet;

        return 0;

drop_pernet:
        unregister_pernet_subsys(&ipvlan_net_ops);
drop_notifiers:
        unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
        unregister_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block);
        unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
        unregister_inet6addr_validator_notifier(&ipvlan_addr6_vtor_notifier_block);
        unregister_netdevice_notifier(&ipvlan_notifier_block);
        return rc;
}

由于ipvlan需要根据ip地址分流,当虚拟网卡添加或者删除ip时,都需要在分流hash表中做相应的操作,因此需要注册ip地址变化事件处理函数。

/* Link-creation handler: attach the new ipvlan device @dev to the lower
 * device named by tb[IFLA_LINK] (looked up in @src_net).
 *
 * The mode defaults to IPVLAN_MODE_L3 unless data[IFLA_IPVLAN_MODE] is given.
 * Returns 0 on success, -EINVAL when IFLA_LINK is missing, -ENODEV when the
 * lower device does not exist, or the negative error of the step that failed.
 *
 * NOTE(review): the unwind path calls unregister_netdevice() and afterwards
 * still uses @port (ida_simple_remove) and may call ipvlan_port_destroy().
 * If the device's uninit hook can already release the port, that would be a
 * use-after-free / double destroy — confirm against the uninit hook, which is
 * not visible here.
 */
int ipvlan_link_new(struct net *src_net, struct net_device *dev,
                    struct nlattr *tb[], struct nlattr *data[],
                    struct netlink_ext_ack *extack)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port;
        struct net_device *phy_dev;
        int err;
        u16 mode = IPVLAN_MODE_L3;
        bool create = false;    /* true when this call created the port */

        if (!tb[IFLA_LINK])
                return -EINVAL;

        phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));  /* look up the lower device */
        if (!phy_dev)
                return -ENODEV;

        if (netif_is_ipvlan(phy_dev)) { /* stacked on another ipvlan: use that ipvlan's lower device */
                struct ipvl_dev *tmp = netdev_priv(phy_dev);

                phy_dev = tmp->phy_dev;
        } else if (!netif_is_ipvlan_port(phy_dev)) {/* lower device has no port yet: create one */
                err = ipvlan_port_create(phy_dev);
                if (err < 0)
                        return err;
                create = true;
        }

        if (data && data[IFLA_IPVLAN_MODE])
                mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); /* requested ipvlan mode */

        port = ipvlan_port_get_rtnl(phy_dev);
        ipvlan->phy_dev = phy_dev;
        ipvlan->dev = dev;
        ipvlan->port = port;
        ipvlan->sfeatures = IPVLAN_FEATURES;
        ipvlan_adjust_mtu(ipvlan, phy_dev);
        INIT_LIST_HEAD(&ipvlan->addrs);

        /* If the port-id base is at the MAX value, then wrap it around and
         * begin from 0x1 again. This may be due to a busy system where lots
         * of slaves are getting created and deleted.
         */
        if (port->dev_id_start == 0xFFFE)
                port->dev_id_start = 0x1;

        /* Since L2 address is shared among all IPvlan slaves including
         * master, use unique 16 bit dev-ids to differentiate among them.
         * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
         * slave link [see addrconf_ifid_eui48()].
         */
        err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE,
                             GFP_KERNEL); 
        /* Nothing free at or above dev_id_start: retry in the range below it. */
        if (err < 0)
                err = ida_simple_get(&port->ida, 0x1, port->dev_id_start,
                                     GFP_KERNEL);
        if (err < 0)
                goto destroy_ipvlan_port;
        dev->dev_id = err;
        /* Increment id-base to the next slot for the future assignment */
        port->dev_id_start = err + 1;

        /* TODO Probably put random address here to be presented to the
         * world but keep using the physical-dev address for the outgoing
         * packets.
         */
        memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN); /* share the lower device's MAC address */

        dev->priv_flags |= IFF_IPVLAN_SLAVE;

        err = register_netdevice(dev);
        if (err < 0)
                goto remove_ida;

        err = netdev_upper_dev_link(phy_dev, dev);
        if (err) {
                goto unregister_netdev;
        }
        err = ipvlan_set_port_mode(port, mode); /* the mode is stored on the port, so all ipvlans on one lower device share it */
        if (err) {
                goto unlink_netdev;
        }

        list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);/* link this ipvlan into the port's device list */
        netif_stacked_transfer_operstate(phy_dev, dev);
        return 0;

        /* Unwind in reverse order of the steps above. */
unlink_netdev:
        netdev_upper_dev_unlink(phy_dev, dev);
unregister_netdev:
        unregister_netdevice(dev);
remove_ida:
        ida_simple_remove(&port->ida, dev->dev_id);
destroy_ipvlan_port:
        /* Only tear the port down if this call was the one that created it. */
        if (create)
                ipvlan_port_destroy(phy_dev);
        return err;
}

打开设备函数

/* Open handler for an ipvlan device.
 *
 * Sets or clears IFF_NOARP according to the port mode, inserts every address
 * of this device into the port's lookup hash table, and finally adds the
 * lower device's own MAC address to its unicast list.
 * Returns the result of dev_uc_add().
 */
static int ipvlan_open(struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct net_device *lower = ipvlan->phy_dev;
        struct ipvl_addr *entry;

        switch (ipvlan->port->mode) {
        case IPVLAN_MODE_L3:
        case IPVLAN_MODE_L3S:
                /* L3/L3S: the ipvlan device does not do ARP itself. */
                dev->flags |= IFF_NOARP;
                break;
        default:
                /* Any other mode (i.e. L2) answers ARP on its own. */
                dev->flags &= ~IFF_NOARP;
                break;
        }

        /* Make this device's addresses reachable through the lookup table. */
        list_for_each_entry(entry, &ipvlan->addrs, anode)
                ipvlan_ht_addr_add(ipvlan, entry);

        return dev_uc_add(lower, lower->dev_addr);
}

数据流:接收数据包

/* Receive handler entry point: dispatches an incoming frame by port mode.
 *
 * Returns RX_HANDLER_PASS when the receiving device has no ipvlan port or the
 * port is in L3S mode, the mode-specific handler's result for L2/L3, and
 * RX_HANDLER_CONSUMED (after freeing the skb and warning once) for an
 * unknown mode.
 */
rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct ipvl_port *port;

        port = ipvlan_port_get_rcu(skb->dev);
        if (!port)
                return RX_HANDLER_PASS;

        switch (port->mode) {
        case IPVLAN_MODE_L2:
                return ipvlan_handle_mode_l2(pskb, port);
        case IPVLAN_MODE_L3:
                return ipvlan_handle_mode_l3(pskb, port);
        case IPVLAN_MODE_L3S:
                /* L3S leaves the frame untouched at this stage; per the
                 * original annotation the device is only switched later,
                 * on the local-input path.
                 */
                return RX_HANDLER_PASS;
        default:
                /* Should not reach here */
                WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
                                  port->mode);
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED;
        }
}
/* L2-mode receive path.
 *
 * Unicast: look the L3 destination up in the port's address table and, when
 * an ipvlan device owns it, hand the frame to ipvlan_rcv_frame() and return
 * its result. Multicast/broadcast: for external frames, queue a clone for
 * the port's multicast worker. In every other case the original skb is
 * passed on with RX_HANDLER_PASS.
 */
static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
                                                 struct ipvl_port *port)
{
        struct sk_buff *skb = *pskb;
        struct ethhdr *eth = eth_hdr(skb);
        struct ipvl_addr *owner;
        struct sk_buff *copy;
        void *l3hdr;
        int l3type;

        if (!is_multicast_ether_addr(eth->h_dest)) {
                l3hdr = ipvlan_get_L3_hdr(skb, &l3type);
                if (!l3hdr)
                        return RX_HANDLER_PASS;

                /* Find the ipvlan device that owns the destination address. */
                owner = ipvlan_addr_lookup(port, l3hdr, l3type, true);
                if (!owner)
                        return RX_HANDLER_PASS;

                /* Deliver to that device (non-local receive). */
                return ipvlan_rcv_frame(owner, pskb, false);
        }

        if (!ipvlan_external_frame(skb, port))
                return RX_HANDLER_PASS;

        /* External frames are queued for device local
         * distribution, but a copy is given to master
         * straight away to avoid sending duplicates later
         * when work-queue processes this frame. This is
         * achieved by returning RX_HANDLER_PASS.
         */
        copy = skb_clone(skb, GFP_ATOMIC);
        if (copy) {
                ipvlan_skb_crossing_ns(copy, NULL);
                ipvlan_multicast_enqueue(port, copy, false);
        }

        return RX_HANDLER_PASS;
}
/* L3-mode receive path.
 *
 * Looks the L3 destination up in the port's address table. When an ipvlan
 * device owns it, the frame is handed to ipvlan_rcv_frame() and its result
 * is returned; otherwise (no L3 header or no owner) returns RX_HANDLER_PASS.
 */
static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb,
                                                 struct ipvl_port *port)
{
        struct sk_buff *skb = *pskb;
        struct ipvl_addr *owner;
        void *l3hdr;
        int l3type;

        l3hdr = ipvlan_get_L3_hdr(skb, &l3type);
        if (!l3hdr)
                return RX_HANDLER_PASS;

        owner = ipvlan_addr_lookup(port, l3hdr, l3type, true);
        if (!owner)
                return RX_HANDLER_PASS;

        return ipvlan_rcv_frame(owner, pskb, false);
}

数据流 发包流程

/* Transmit hook of an ipvlan device.
 *
 * Passes the skb to ipvlan_queue_xmit() and updates this CPU's statistics:
 * tx_pkts/tx_bytes when the result is NET_XMIT_SUCCESS or NET_XMIT_CN,
 * tx_drps otherwise. Returns the result of ipvlan_queue_xmit().
 */
static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_pcpu_stats *stats;
        /* Sample the length up front: the skb must not be touched after
         * it has been handed to ipvlan_queue_xmit().
         */
        int len = skb->len;
        int rc;

        rc = ipvlan_queue_xmit(skb, dev);
        if (unlikely(rc != NET_XMIT_SUCCESS && rc != NET_XMIT_CN)) {
                this_cpu_inc(ipvlan->pcpu_stats->tx_drps);
                return rc;
        }

        stats = this_cpu_ptr(ipvlan->pcpu_stats);

        u64_stats_update_begin(&stats->syncp);
        stats->tx_pkts++;
        stats->tx_bytes += len;
        u64_stats_update_end(&stats->syncp);

        return rc;
}
/* Dispatch an outgoing skb to the transmit routine of the port's mode.
 *
 * The skb is freed and NET_XMIT_DROP returned when the lower device has no
 * port, when an Ethernet header cannot be pulled into the linear area, or
 * when the mode is unknown (which also warns once). Otherwise returns the
 * result of the L2 or L3 transmit routine; L3 and L3S share the latter.
 */
int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port;

        port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev);
        if (!port || unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
                goto drop;

        switch (port->mode) {
        case IPVLAN_MODE_L2:
                return ipvlan_xmit_mode_l2(skb, dev);
        case IPVLAN_MODE_L3:
        case IPVLAN_MODE_L3S:
                return ipvlan_xmit_mode_l3(skb, dev);
        default:
                /* Should not reach here */
                WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n",
                                  port->mode);
                break;
        }

drop:
        kfree_skb(skb);
        return NET_XMIT_DROP;
}
static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ethhdr *eth = eth_hdr(skb);
        struct ipvl_addr *addr;
        void *lyr3h;
        int addr_type;

        if (ether_addr_equal(eth->h_dest, eth->h_source)) { //发送给自己的数据包
                lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
                if (lyr3h) {
                        addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
                        if (addr)
                                return ipvlan_rcv_frame(addr, &skb, true);  //直接通过ipvlan设备上送
                }
                skb = skb_share_check(skb, GFP_ATOMIC);
                if (!skb)
                        return NET_XMIT_DROP;

                /* Packet definitely does not belong to any of the
                 * virtual devices, but the dest is local. So forward
                 * the skb for the main-dev. At the RX side we just return
                 * RX_PASS for it to be processed further on the stack.
                 */
                return dev_forward_skb(ipvlan->phy_dev, skb);   //通过物理设备上送

        } else if (is_multicast_ether_addr(eth->h_dest)) {  //广播报文
                ipvlan_skb_crossing_ns(skb, NULL);
                ipvlan_multicast_enqueue(ipvlan->port, skb, true);  //添加到广播报文缓冲队列中
                return NET_XMIT_SUCCESS;
        }

        ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
        return dev_queue_xmit(skb); //通过物理设备发送出去
}
/* L3/L3S-mode transmit path.
 *
 * When the L3 destination belongs to an ipvlan device on the same port, the
 * skb is delivered to it locally through ipvlan_rcv_frame(). Otherwise (no
 * L3 header, or no owner found) the skb is moved to the lower device and
 * sent through ipvlan_process_outbound(), which routes it.
 */
static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_addr *owner;
        void *l3hdr;
        int l3type;

        l3hdr = ipvlan_get_L3_hdr(skb, &l3type);
        if (l3hdr) {
                owner = ipvlan_addr_lookup(ipvlan->port, l3hdr, l3type, true);
                if (owner)
                        return ipvlan_rcv_frame(owner, &skb, true);
        }

        ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
        return ipvlan_process_outbound(skb);
}
/* Send an L3-mode skb out through the IP stack.
 *
 * If the skb carries a MAC header, multicast/broadcast destinations are
 * dropped and the L2 header is stripped. The packet is then handed to the
 * IPv6 or IPv4 outbound routine according to skb->protocol; any other
 * protocol is dropped.
 *
 * Returns the result of the per-protocol routine, or NET_XMIT_DROP when the
 * skb was dropped here. The skb is always consumed.
 */
static int ipvlan_process_outbound(struct sk_buff *skb)
{
        int ret = NET_XMIT_DROP;

        /* The ipvlan is a pseudo-L2 device, so the packets that we receive
         * will have L2; which need to discarded and processed further
         * in the net-ns of the main-device.
         *
         * The Ethernet header is only looked at once we know the MAC header
         * offset is valid: eth_hdr() on an skb whose mac_header was never
         * set would read from an arbitrary location.
         */
        if (skb_mac_header_was_set(skb)) {
                struct ethhdr *ethh = eth_hdr(skb);

                /* In this mode we dont care about multicast and broadcast traffic */
                if (is_multicast_ether_addr(ethh->h_dest)) {
                        pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n",
                                            ntohs(skb->protocol));
                        kfree_skb(skb);
                        goto out;
                }

                skb_pull(skb, sizeof(*ethh));
                /* Mark the MAC header as "not set" now that it is stripped. */
                skb->mac_header = (typeof(skb->mac_header))~0U;
                skb_reset_network_header(skb);
        }

        if (skb->protocol == htons(ETH_P_IPV6))
                ret = ipvlan_process_v6_outbound(skb);
        else if (skb->protocol == htons(ETH_P_IP))
                ret = ipvlan_process_v4_outbound(skb);
        else {
                pr_warn_ratelimited("Dropped outbound packet type=%x\n",
                                    ntohs(skb->protocol));
                kfree_skb(skb);
        }
out:
        return ret;
}
/* Route and send an IPv4 packet on behalf of an ipvlan device.
 *
 * Builds a flow from the packet's addresses and TOS with skb->dev as the
 * output interface, looks up an output route, and sends the packet with
 * ip_local_out(). Only RTN_UNICAST and RTN_LOCAL routes are accepted.
 *
 * Returns NET_XMIT_SUCCESS when ip_local_out() succeeds, NET_XMIT_DROP
 * otherwise; every failure bumps dev->stats.tx_errors. The skb is freed here
 * when no usable route is found.
 */
static int ipvlan_process_v4_outbound(struct sk_buff *skb)
{
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net_device *dev = skb->dev;
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int err;
        struct flowi4 fl4 = {
                .flowi4_oif = dev->ifindex,
                .flowi4_tos = RT_TOS(ip4h->tos),
                .flowi4_flags = FLOWI_FLAG_ANYSRC,
                .daddr = ip4h->daddr,
                .saddr = ip4h->saddr,
        };

        rt = ip_route_output_flow(net, &fl4, NULL);
        if (IS_ERR(rt))
                goto drop;

        if (!(rt->rt_type == RTN_UNICAST || rt->rt_type == RTN_LOCAL)) {
                ip_rt_put(rt);
                goto drop;
        }

        /* The route is attached to the skb; ip_local_out() takes it from here. */
        skb_dst_set(skb, &rt->dst);
        err = ip_local_out(net, skb->sk, skb);
        if (unlikely(net_xmit_eval(err))) {
                dev->stats.tx_errors++;
                return NET_XMIT_DROP;
        }
        return NET_XMIT_SUCCESS;

drop:
        dev->stats.tx_errors++;
        kfree_skb(skb);
        return NET_XMIT_DROP;
}

ipvlan设备相当于macvlan设备的扩充。L3模式在发送数据包时不一定会走ipvlan的宿主设备,也可能根据路由查找结果通过其他设备发送出去。

发表评论

电子邮件地址不会被公开。 必填项已用*标注