分类
linux virtual nic 未分类

macvlan 虚拟网卡

macvlan 虚拟网卡设备
macvlan虚拟网卡设备是寄生在物理网卡设备上的。发包时调用自己的发包函数,查找到所寄生的物理设备,然后通过物理设备发包。收包时,通过在所寄生的物理设备上注册的rx_handler回调函数处理数据包。
macvlan 虚拟网卡设备包括5种模式
private 模式:在这种模式下,macvlan设备不能接收寄生在同一个物理网卡上的其他macvlan设备的数据包,即使是其他macvlan设备通过物理网卡发送出去并通过hairpin设备返回的包也不接收
vepa 模式:在这种模式下,macvlan设备不能直接接收寄生在同一个物理网卡上的其他macvlan设备的数据包,但是其他macvlan设备可以将数据包通过物理网卡发送出去,然后通过hairpin设备返回给本macvlan设备
passthru 模式:在这种模式下,每一个物理设备只能寄生一个macvlan设备
bridge 模式:在这种模式下,寄生在同一个物理设备的macvlan设备可以直接通讯,不需要外接的hairpin设备帮助
source 模式: 在这种模式下,寄生在物理设备上的这类macvlan设备,只接收源mac地址在指定列表中的数据包,其他数据包都不接收。

macvlan设备 关键数据结构
macvlan_port,这个数据结构在注册rx_handler时使用,作为回调函数的参数。

/*
 * State shared by all macvlan devices stacked on one lower device.
 * macvlan_handle_frame() obtains it from the receiving device with
 * macvlan_port_get_rcu(skb->dev).
 */
struct macvlan_port {
        struct net_device       *dev;                           // the lower (physical) device
        struct hlist_head       vlan_hash[MACVLAN_HASH_SIZE];   // hash of macvlan devices; presumably the table searched by macvlan_hash_lookup()
        struct list_head        vlans;                          // list of macvlan devices, linked through macvlan_dev.list
        struct sk_buff_head     bc_queue;                       // queue of multicast/broadcast frames waiting for delivery
        struct work_struct      bc_work;                        // work item whose handler (macvlan_process_broadcast) drains bc_queue
        u32                     flags;                          // port flags
        int                     count;                          // number of macvlan devices on this port
        struct hlist_head       vlan_source_hash[MACVLAN_HASH_SIZE];    // used only by source-mode macvlan devices
        DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);        // tested in macvlan_handle_frame() before a multicast frame is queued
        unsigned char           perm_addr[ETH_ALEN];
};

macvlan_dev,这个数据结构是macvlan网卡的私有数据结构,每创建一个macvlan设备就会创建一个这样的结构,并将它挂在 macvlan_port 数据结构上。

/*
 * Private data of one macvlan net_device (obtained with netdev_priv(dev)).
 * Every instance is linked onto the macvlan_port of its lower device.
 */
struct macvlan_dev {
        struct net_device       *dev;           // the macvlan net_device itself
        struct list_head        list;           // entry in macvlan_port.vlans
        struct hlist_node       hlist;          // hash-table linkage of this macvlan device
        struct macvlan_port     *port;          // port shared with the other macvlans on lowerdev
        struct net_device       *lowerdev;      // the lower (physical) device this macvlan sits on
        void                    *fwd_priv;      // value returned by ndo_dfwd_add_station(); non-NULL while hardware forwarding offload is in use
        struct vlan_pcpu_stats __percpu *pcpu_stats;    // per-CPU tx/rx counters

        DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);

        netdev_features_t       set_features;
        enum macvlan_mode       mode;           // private / vepa / bridge / passthru / source
        u16                     flags;          // e.g. MACVLAN_FLAG_NOPROMISC
        /* This array tracks active taps. */
        struct tap_queue        __rcu *taps[MAX_TAP_QUEUES];
        /* This list tracks all taps (both enabled and disabled) */
        struct list_head        queue_list;
        int                     numvtaps;
        int                     numqueues;
        netdev_features_t       tap_features;
        int                     minor;
        int                     nest_level;     // set to dev_get_nest_level(lowerdev) + 1 at creation
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll          *netpoll;
#endif
        unsigned int            macaddr_count;
};

两个私有结构之间的关系

digraph tun{
    node [shape = plaintext]
    rankdir = LR
    macvlan_port[label=<
            <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
            <TR><TD BORDER="0" ALIGN="CENTER">struct macvlan_port</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f0">struct net_device *dev</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f1">struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]</TD></TR>
            <TR><TD ALIGN="CENTER">struct list_head vlans</TD></TR>
            <TR><TD ALIGN="LEFT">struct sk_buff_head bc_queue</TD></TR>
            <TR><TD ALIGN="CENTER">struct work_struct bc_work</TD></TR>
            <TR><TD ALIGN="LEFT">u32 flags</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f2">struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            </TABLE>>]
    macvlan_dev[label=<
            <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
            <TR><TD BORDER="0" ALIGN="CENTER">struct macvlan_dev</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f0">struct net_device *dev</TD></TR>
            <TR><TD ALIGN="LEFT">struct list_head list</TD></TR>
            <TR><TD ALIGN="LEFT">struct hlist_node hlist</TD></TR>
            <TR><TD ALIGN="LEFT">struct macvlan_port *port</TD></TR>
            <TR><TD ALIGN="LEFT">struct net_device *lowerdev</TD></TR>
            <TR><TD ALIGN="LEFT">void *fwd_priv</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            </TABLE>>]
    macvlan_port:f1 -> macvlan_dev:f0
    macvlan_port:f2 -> macvlan_dev:f0
}

模块注册

/*
 * Module entry point: register the netdevice notifier and then the
 * rtnetlink link operations used to create macvlan devices.
 *
 * Returns 0 on success or a negative errno. On failure nothing stays
 * registered.
 */
static int __init macvlan_init_module(void)
{
        int err;

        /*
         * The notifier result used to be ignored; a failed registration
         * would leave the module loaded without ever seeing events from
         * the lower devices. Fail the load instead.
         */
        err = register_netdevice_notifier(&macvlan_notifier_block);
        if (err)
                return err;

        err = macvlan_link_register(&macvlan_link_ops);     // register the netlink (rtnl) link ops
        if (err < 0)
                goto err1;
        return 0;
err1:
        unregister_netdevice_notifier(&macvlan_notifier_block);
        return err;
}

创建macvlan设备

/*
 * Common "newlink" handler: configure the new macvlan device `dev` on top
 * of the lower device named by tb[IFLA_LINK] and register it.
 *
 * src_net: namespace in which the lower device index is resolved.
 * tb:      generic link attributes (IFLA_LINK, IFLA_MTU, IFLA_ADDRESS).
 * data:    macvlan-specific attributes (mode, flags, source MAC handling);
 *          may be NULL.
 *
 * Returns 0 on success or a negative errno. A macvlan_port created by this
 * call is destroyed again on failure.
 */
int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
                           struct nlattr *tb[], struct nlattr *data[])
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct macvlan_port *port;
        struct net_device *lowerdev;
        int err;
        int macmode;
        bool create = false;    /* true when this call created the port */

        if (!tb[IFLA_LINK])
                return -EINVAL;

        lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
        if (lowerdev == NULL)
                return -ENODEV;

        /* When creating macvlans or macvtaps on top of other macvlans - use
         * the real device as the lowerdev.
         */
        if (netif_is_macvlan(lowerdev))
                lowerdev = macvlan_dev_real_dev(lowerdev);  // resolve to the underlying physical device

        /* Inherit the lower MTU unless one was given; never exceed it. */
        if (!tb[IFLA_MTU])
                dev->mtu = lowerdev->mtu;
        else if (dev->mtu > lowerdev->mtu)
                return -EINVAL;

        /* MTU range: 68 - lowerdev->max_mtu */
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = lowerdev->max_mtu;

        if (!tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        if (!macvlan_port_exists(lowerdev)) {
                err = macvlan_port_create(lowerdev);    // lower device has no macvlan_port yet: create one (presumably also where the rx_handler is registered)
                if (err < 0)
                        return err;
                create = true;
        }
        port = macvlan_port_get_rtnl(lowerdev);

        /* Only 1 macvlan device can be created in passthru mode */
        if (macvlan_passthru(port)) {   // port already carries a passthru macvlan: refuse to add another device
                /* The macvlan port must be not created this time,
                 * still goto destroy_macvlan_port for readability.
                 */
                err = -EINVAL;
                goto destroy_macvlan_port;
        }

        vlan->lowerdev = lowerdev;
        vlan->dev      = dev;
        vlan->port     = port;
        vlan->set_features = MACVLAN_FEATURES;
        vlan->nest_level = dev_get_nest_level(lowerdev) + 1;

        /* VEPA is the default mode when none is supplied. */
        vlan->mode     = MACVLAN_MODE_VEPA;
        if (data && data[IFLA_MACVLAN_MODE])
                vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);

        if (data && data[IFLA_MACVLAN_FLAGS])
                vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);

        if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
                if (port->count) {          // a passthru device cannot be added once the lower device already carries macvlan devices
                        err = -EINVAL;
                        goto destroy_macvlan_port;
                }
                macvlan_set_passthru(port);
                eth_hw_addr_inherit(dev, lowerdev);
        }

        if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
                /* Source MAC lists are only valid for source-mode devices. */
                if (vlan->mode != MACVLAN_MODE_SOURCE) {
                        err = -EINVAL;
                        goto destroy_macvlan_port;
                }
                macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]);
                err = macvlan_changelink_sources(vlan, macmode, data);  // source mode: presumably installs the source MACs this device accepts
                if (err)
                        goto destroy_macvlan_port;
        }

        err = register_netdevice(dev);
        if (err < 0)
                goto destroy_macvlan_port;

        dev->priv_flags |= IFF_MACVLAN;
        err = netdev_upper_dev_link(lowerdev, dev);
        if (err)
                goto unregister_netdev;

        list_add_tail_rcu(&vlan->list, &port->vlans);   // link the new macvlan onto the port's device list
        netif_stacked_transfer_operstate(lowerdev, dev);
        linkwatch_fire_event(dev);

        return 0;

unregister_netdev:
        unregister_netdevice(dev);
destroy_macvlan_port:
        /* Only tear the port down if this call created it. */
        if (create)
                macvlan_port_destroy(port->dev);
        return err;
}

收包逻辑

/*
 * Receive hook for frames arriving on the lower device (the article
 * describes it as the rx_handler registered on that device).
 *
 * Multicast/broadcast frames are handed to source-mode devices, possibly
 * queued for deferred flooding, and then passed on to the lower device.
 * Unicast frames are matched to a macvlan device by destination MAC (or the
 * single passthru device) and re-targeted to it.
 *
 * Returns RX_HANDLER_PASS to let the lower device handle the frame,
 * RX_HANDLER_ANOTHER after skb->dev was switched to the macvlan device, or
 * RX_HANDLER_CONSUMED when the frame was delivered or dropped here.
 */
static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
{
        struct macvlan_port *port;
        struct sk_buff *skb = *pskb;
        const struct ethhdr *eth = eth_hdr(skb);
        const struct macvlan_dev *vlan;
        const struct macvlan_dev *src;
        struct net_device *dev;
        unsigned int len = 0;
        int ret;
        rx_handler_result_t handle_res;

        port = macvlan_port_get_rcu(skb->dev);
        if (is_multicast_ether_addr(eth->h_dest)) { // multicast (including broadcast) destination
                unsigned int hash;

                skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN);
                if (!skb)
                        return RX_HANDLER_CONSUMED;
                *pskb = skb;
                eth = eth_hdr(skb);     /* skb may have changed: reload the header pointer */
                macvlan_forward_source(skb, port, eth->h_source);// receive path for source-mode macvlan devices
                src = macvlan_hash_lookup(port, eth->h_source);// is the sender a macvlan device on this same port?
                if (src && src->mode != MACVLAN_MODE_VEPA &&
                    src->mode != MACVLAN_MODE_BRIDGE) { // sender is a local macvlan in a mode other than VEPA/BRIDGE: deliver only back to that device
                        /* forward to original port. */
                        vlan = src;
                        ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:
                              netif_rx(skb);
                        handle_res = RX_HANDLER_CONSUMED;
                        /* NOTE(review): len is still 0 when rx is counted at out: on this path */
                        goto out;
                }

                hash = mc_hash(NULL, eth->h_dest);
                if (test_bit(hash, port->mc_filter))
                        macvlan_broadcast_enqueue(port, src, skb);  // queue the frame; it is flooded later from the bc_work work item

                return RX_HANDLER_PASS; // the lower device also receives the multicast frame
        }

        macvlan_forward_source(skb, port, eth->h_source); // unicast: receive path for source-mode devices
        if (macvlan_passthru(port))
                vlan = list_first_or_null_rcu(&port->vlans,
                                              struct macvlan_dev, list);
        else
                vlan = macvlan_hash_lookup(port, eth->h_dest); // find the macvlan device owning the destination MAC
        if (vlan == NULL)
                return RX_HANDLER_PASS; // no match: continue with normal lower-device processing

        dev = vlan->dev;
        if (unlikely(!(dev->flags & IFF_UP))) {
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED; // matching device is down: drop the frame
        }
        len = skb->len + ETH_HLEN;
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb) {
                ret = NET_RX_DROP;
                handle_res = RX_HANDLER_CONSUMED;
                goto out;
        }

        *pskb = skb;
        skb->dev = dev;
        skb->pkt_type = PACKET_HOST; // matched and device is up: mark the frame as addressed to this host

        ret = NET_RX_SUCCESS;
        handle_res = RX_HANDLER_ANOTHER;    // rerun receive processing with the macvlan device as skb->dev
out:
        macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
        return handle_res;
}

发送工作队列,工作函数

static void macvlan_process_broadcast(struct work_struct *w)
{       
        struct macvlan_port *port = container_of(w, struct macvlan_port,
                                                 bc_work);
        struct sk_buff *skb;
        struct sk_buff_head list;

        __skb_queue_head_init(&list);

        spin_lock_bh(&port->bc_queue.lock);
        skb_queue_splice_tail_init(&port->bc_queue, &list);
        spin_unlock_bh(&port->bc_queue.lock);

        while ((skb = __skb_dequeue(&list))) {
                const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;

                rcu_read_lock();

                if (!src) //如果不是寄生在同一个 物理设备上的macvlan发送,则所有macvlan都 收包
                        /* frame comes from an external address */
                        macvlan_broadcast(skb, port, NULL,
                                          MACVLAN_MODE_PRIVATE |
                                          MACVLAN_MODE_VEPA    |
                                          MACVLAN_MODE_PASSTHRU|
                                          MACVLAN_MODE_BRIDGE);
                else if (src->mode == MACVLAN_MODE_VEPA)//如果是寄生的同一个物理设备上的VEPA类型macvlan发送,则VEPA类型和BRIDGE类型macvlan都受到,private不收
                        /* flood to everyone except source */
                        macvlan_broadcast(skb, port, src->dev,
                                          MACVLAN_MODE_VEPA |
                                          MACVLAN_MODE_BRIDGE);
                else    //如果是寄生在同一个物理设备上的BRIDGE类型macvlan发送,则VEPA类型macvlan受到,private 和 bridge类型不收, bridge不收是因为在发送的时候已经上送给它了。
                        /*
                         * flood only to VEPA ports, bridge ports
                         * already saw the frame on the way out.
                         */
                        macvlan_broadcast(skb, port, src->dev,
                                          MACVLAN_MODE_VEPA);

                rcu_read_unlock();

                if (src)
                        dev_put(src->dev);
                kfree_skb(skb);
        }
}

通过控制广播报文的收发,macvlan保证寄生在同一个物理设备的private设备不能相互收到包。arp协议基于广播报文。

发包流程

/*
 * Transmit entry point of a macvlan device.
 *
 * Sends through the lower device's forwarding offload when vlan->fwd_priv
 * is set, otherwise through macvlan_queue_xmit(), and then updates the
 * per-CPU tx counters. Returns the result of the chosen transmit call.
 */
static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct vlan_pcpu_stats *stats;
        /* Length is sampled before the skb is handed to the transmit call. */
        unsigned int len = skb->len;
        int ret;

        if (unlikely(netpoll_tx_running(dev)))
                return macvlan_netpoll_send_skb(vlan, skb);

        if (!vlan->fwd_priv) {
                /* no offload handle: regular software path */
                ret = macvlan_queue_xmit(skb, dev);
        } else {
                /* offload handle present: accelerated transmit on the lower device */
                skb->dev = vlan->lowerdev;
                ret = dev_queue_xmit_accel(skb, vlan->fwd_priv);
        }

        if (unlikely(ret != NET_XMIT_SUCCESS && ret != NET_XMIT_CN)) {
                this_cpu_inc(vlan->pcpu_stats->tx_dropped);
                return ret;
        }

        /* Success (or congestion notification): account packet and bytes. */
        stats = this_cpu_ptr(vlan->pcpu_stats);
        u64_stats_update_begin(&stats->syncp);
        stats->tx_packets++;
        stats->tx_bytes += len;
        u64_stats_update_end(&stats->syncp);

        return ret;
}

macvlan_queue_xmit函数

/*
 * Software transmit path of a macvlan device.
 *
 * For a bridge-mode sender, multicast frames are first copied to the other
 * bridge-mode devices on the port, and unicast frames addressed to another
 * bridge-mode device are delivered locally without touching the wire.
 * Everything else is queued on the lower device.
 *
 * Returns NET_XMIT_SUCCESS for a locally delivered unicast frame, otherwise
 * the result of dev_queue_xmit().
 */
static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
        const struct macvlan_dev *vlan = netdev_priv(dev);
        const struct macvlan_port *port = vlan->port;

        if (vlan->mode == MACVLAN_MODE_BRIDGE) {
                const struct ethhdr *eth = (void *)skb->data;

                if (is_multicast_ether_addr(eth->h_dest)) {
                        /*
                         * send to other bridge ports directly; the frame
                         * still goes out on the lower device below
                         */
                        macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
                } else {
                        const struct macvlan_dev *dest;

                        /* is the destination another macvlan on this port? */
                        dest = macvlan_hash_lookup(port, eth->h_dest);
                        if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
                                /* send to lowerdev first for its network taps */
                                dev_forward_skb(vlan->lowerdev, skb);

                                return NET_XMIT_SUCCESS;
                        }
                }
        }

        /* Hand the frame to the lower device for transmission. */
        skb->dev = vlan->lowerdev;
        return dev_queue_xmit(skb);
}

打开网络设备

/*
 * Open handler of a macvlan device.
 *
 * Passthru ports only (optionally) put the lower device into promiscuous
 * mode. Otherwise hardware forwarding offload is tried first; if that is
 * unavailable the device's MAC is added to the lower device's unicast list
 * and the allmulti/promisc settings are propagated. On the non-offloaded
 * paths the device is finally added to the port's hash table.
 *
 * Returns 0 on success or a negative errno.
 */
static int macvlan_open(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;
        int err;

        if (macvlan_passthru(vlan->port)) {
                if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) {
                        err = dev_set_promiscuity(lowerdev, 1);
                        if (err < 0)
                                goto out;
                }
                goto hash_add;
        }

        if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD &&
            dev->rtnl_link_ops == &macvlan_link_ops) {
                vlan->fwd_priv =
                      lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);

                /* If we get a NULL pointer back, or if we get an error
                 * then we should just fall through to the non accelerated path
                 */
                if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
                        vlan->fwd_priv = NULL;
                } else
                        return 0;
        }   // set up hardware forwarding offload when the lower device advertises it

        /* Refuse a MAC address that macvlan_addr_busy() reports as in use. */
        err = -EBUSY;
        if (macvlan_addr_busy(vlan->port, dev->dev_addr))
                goto out;

        err = dev_uc_add(lowerdev, dev->dev_addr);
        if (err < 0)
                goto out;
        if (dev->flags & IFF_ALLMULTI) {
                err = dev_set_allmulti(lowerdev, 1);
                if (err < 0)
                        goto del_unicast;
        }

        if (dev->flags & IFF_PROMISC) {
                err = dev_set_promiscuity(lowerdev, 1); // propagate promiscuous receive to the lower device
                if (err < 0)
                        goto clear_multi;
        }

hash_add:
        macvlan_hash_add(vlan); // not offloaded: add the opened macvlan to the hash table
        return 0;

        /* Error unwinding, in reverse order of the steps above. */
clear_multi:
        if (dev->flags & IFF_ALLMULTI)
                dev_set_allmulti(lowerdev, -1);
del_unicast:
        dev_uc_del(lowerdev, dev->dev_addr);
out:
        if (vlan->fwd_priv) {
                lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
                                                           vlan->fwd_priv);
                vlan->fwd_priv = NULL;
        }
        return err;
}

关闭

/*
 * Stop handler of a macvlan device; undoes what macvlan_open() set up.
 *
 * An offloaded device only releases its forwarding station. Otherwise the
 * address lists are unsynced from the lower device, allmulti/promisc
 * references are dropped, the unicast address is removed and the device is
 * taken out of the port's hash table. Always returns 0.
 */
static int macvlan_stop(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;

        if (vlan->fwd_priv) {
                lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
                                                           vlan->fwd_priv); // release the hardware forwarding station
                vlan->fwd_priv = NULL;
                return 0;
        }

        dev_uc_unsync(lowerdev, dev);
        dev_mc_unsync(lowerdev, dev);

        if (macvlan_passthru(vlan->port)) {
                if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC))
                        dev_set_promiscuity(lowerdev, -1);
                goto hash_del;
        }

        if (dev->flags & IFF_ALLMULTI)
                dev_set_allmulti(lowerdev, -1);

        if (dev->flags & IFF_PROMISC)
                dev_set_promiscuity(lowerdev, -1); // drop the promiscuous reference taken in macvlan_open()

        dev_uc_del(lowerdev, dev->dev_addr); // remove the unicast address added with dev_uc_add() in macvlan_open()

hash_del:
        /* Remove this macvlan from the port's hash table. */
        macvlan_hash_del(vlan, !dev->dismantle);
        return 0;
}

macvlan当前有很多应用,在docker 和虚拟化中应用很多,在虚拟化中多数使用macvtap设备,在docker中使用macvlan设备。另外,vrrp等一些需要接受其他mac地址的应用也可以使用macvlan设备。

分类
linux virtual nic

tun/tap虚拟网卡

tun/tap虚拟网卡设备
物理网卡设备在发送数据包时,通过线缆将数据包发送到对端设备上。
VETH网卡设备,在发送数据包时,将数据包直接交由配对的网卡收包并上送协议栈。
tun/tap虚拟网卡设备在网卡发包时,将数据包存放在一个缓冲区,当进程读取特定字符设备时,将缓冲区的数据包上送给进程。当进程写特定字符设备时,将数据包发送给网卡收包并上送协议栈。
个人觉得这种特性就是将raw socket 和 VETH网卡设备结合起来。当前使用这种网卡设备较多的场景有两个。第一个是VPN设备:通过协议栈将隧道数据上送到用户态进程,对隧道中的数据进行解密,然后将数据写入字符设备,再次通过协议栈对解密后的数据进行路由或者上送到其他进程处理。第二个比较常用的场景是虚拟化:tun/tap设备加入网桥中,客户机发包时通过字符设备将数据包下发到桥中,客户机收包时,从缓冲区收取桥送过来的数据。

由于这个网卡涉及到字符设备和网卡设备,因此有两个关键的私有数据结构
字符设备私有数据结构

/*
 * Private data of one open instance of the tun character device
 * (stored in file->private_data by tun_chr_open()).
 */
struct tun_file {
        struct sock sk;                         // sock used to wake up blocked user-space readers (sk_data_ready is called from tun_net_xmit)
        struct socket socket;                   // per the author's notes, lets vhost_net read and write packets directly in the kernel
        struct socket_wq wq;                    // wait queue that blocked readers sleep on
        struct tun_struct __rcu *tun;           // private data of the network device attached to this file; NULL until attached
        struct fasync_struct *fasync;
        /* only used for fasync */
        unsigned int flags;
        union {
                u16 queue_index;
                unsigned int ifindex;
        };
        struct list_head next;
        struct tun_struct *detached;
        struct skb_array tx_array;               // buffer of this file: packets transmitted by the network device are produced into it
};

网卡设备私有数据结构

/*
 * Private data of a tun/tap network device (netdev_priv(dev)).
 */
struct tun_struct {
        struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES]; // files attached to this device; one device can have several open instances (multi-queue)
        unsigned int            numqueues;               // number of attached queues
        unsigned int            flags;                  // tun/tap feature flags (IFF_TUN, IFF_TAP, IFF_MULTI_QUEUE, ...)
        kuid_t                  owner;
        kgid_t                  group;

        struct net_device       *dev;                   // the network device this structure belongs to
        netdev_features_t       set_features;
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
                          NETIF_F_TSO6|NETIF_F_UFO)

        int                     align;
        int                     vnet_hdr_sz;
        int                     sndbuf;
        struct tap_filter       txflt;
        struct sock_fprog       fprog;
        /* protected by rtnl lock */
        bool                    filter_attached;
#ifdef TUN_DEBUG
        int debug;
#endif
        spinlock_t lock;
        struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
        struct timer_list flow_gc_timer;
        unsigned long ageing_time;
        unsigned int numdisabled;
        struct list_head disabled;
        void *security;
        u32 flow_count;
        u32 rx_batched;
        struct tun_pcpu_stats __percpu *pcpu_stats;    // per-CPU rx/tx statistics of the device
};

两个私有数据结构之间的关系

digraph tun{
    node [shape = plaintext]
    rankdir = LR
    tun_struct[label=<
            <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
            <TR><TD BORDER="0" ALIGN="CENTER">struct tun_struct</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f0">struct tun_file *tfiles[MAX_TAP_QUEUES]</TD></TR>
            <TR><TD ALIGN="LEFT">unsigned int numqueues</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            <TR><TD ALIGN="LEFT">struct net_device *dev</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            <TR><TD ALIGN="LEFT">struct hlist_head flows[TUN_NUM_FLOW_ENTRIES]</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            <TR><TD ALIGN="LEFT">struct tun_pcpu_stats __percpu *pcpu_stats</TD></TR>
            </TABLE>>]
    tun_file[label=<
            <TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
            <TR><TD BORDER="0" ALIGN="CENTER">struct tun_file</TD></TR>
            <TR><TD ALIGN="LEFT" PORT="f0">struct sock sk</TD></TR>
            <TR><TD ALIGN="LEFT">struct socket socket</TD></TR>
            <TR><TD ALIGN="LEFT">socket_wq wq</TD></TR>
            <TR><TD ALIGN="LEFT">struct tun_struct *tun</TD></TR>
            <TR><TD ALIGN="CENTER">......</TD></TR>
            <TR><TD ALIGN="LEFT">struct tun_struct *detached</TD></TR>
            <TR><TD ALIGN="LEFT">struct skb_array tx_array</TD></TR>
            </TABLE>>]
    tun_struct:f0 -> tun_file:f0
}

模块注册

/*
 * Module entry point of the tun/tap driver.
 *
 * Registers, in order: the rtnetlink link ops, the misc character device
 * and the netdevice notifier. Returns 0 on success or the error of the
 * first step that failed; every step completed before the failure is
 * undone again.
 */
static int __init tun_init(void)
{
        int ret = 0;

        pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);

        /*
         * Register the rtnl link ops. Author's note: devices do not seem to
         * be created through this interface at present.
         */
        ret = rtnl_link_register(&tun_link_ops);
        if (ret) {
                pr_err("Can't register link_ops\n");
                goto err_linkops;
        }

        /*
         * Register the character device; user space creates the network
         * device through its ioctl interface (TUNSETIFF).
         */
        ret = misc_register(&tun_miscdev);
        if (ret) {
                pr_err("Can't register misc device %d\n", TUN_MINOR);
                goto err_misc;
        }

        /*
         * The notifier result used to be ignored, so a failure left the
         * driver loaded without its notifier. Treat it like the other
         * registration steps and unwind.
         */
        ret = register_netdevice_notifier(&tun_notifier_block);
        if (ret) {
                pr_err("Can't register netdevice notifier\n");
                goto err_notifier;
        }

        return  0;

err_notifier:
        misc_deregister(&tun_miscdev);
err_misc:
        rtnl_link_unregister(&tun_link_ops);
err_linkops:
        return ret;
}

打开字符设备

/*
 * open() handler of the tun character device.
 *
 * Allocates the per-file tun_file (embedded in a struct sock), initialises
 * its wait queue and socket, and stores it in file->private_data. No
 * network device is attached yet (tfile->tun is NULL).
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int tun_chr_open(struct inode *inode, struct file * file)
{       
        struct net *net = current->nsproxy->net_ns;
        struct tun_file *tfile;

        DBG1(KERN_INFO, "tunX: tun_chr_open\n");

        tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                            &tun_proto, 0); // allocate the file's private data; it buffers skbs and wakes readers when data arrives
        if (!tfile)
                return -ENOMEM;
        RCU_INIT_POINTER(tfile->tun, NULL);
        tfile->flags = 0;
        tfile->ifindex = 0;

        init_waitqueue_head(&tfile->wq.wait);
        RCU_INIT_POINTER(tfile->socket.wq, &tfile->wq);

        tfile->socket.file = file;
        tfile->socket.ops = &tun_socket_ops;            // per the author's notes, vhost-net sends and receives packets through these ops

        sock_init_data(&tfile->socket, &tfile->sk);     // initialise the sk and socket members

        tfile->sk.sk_write_space = tun_sock_write_space;
        tfile->sk.sk_sndbuf = INT_MAX;

        file->private_data = tfile;
        INIT_LIST_HEAD(&tfile->next);

        sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);

        return 0;
}

通过 ioctl设置字符设备对应的网卡设备。

/*
 * ioctl handler of the tun character device (excerpt; "......" marks code
 * elided by the article). Only the TUNSETIFF command is shown: it binds the
 * file to a network device through tun_set_iff() and copies the resulting
 * ifreq back to user space.
 */
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg, int ifreq_len)
{
......
       if (cmd == TUNSETIFF) {
                /* Fail with -EEXIST when the file already has a device. */
                ret = -EEXIST;
                if (tun)
                        goto unlock;

                ifr.ifr_name[IFNAMSIZ-1] = '\0';

                ret = tun_set_iff(sock_net(&tfile->sk), file, &ifr); // attach this file to a network device

                if (ret)
                        goto unlock;

                if (copy_to_user(argp, &ifr, ifreq_len))
                        ret = -EFAULT;
                goto unlock;
        }
......
}

tun_set_iff 函数实体,这个函数主要是将文件的私有结构和网卡的私有结构关联。当网卡有数据时,直接存入文件的私有结构的缓冲区,当文件写入时,调用文件的关联网卡上送数据到协议栈

/*
 * Bind the open file to a tun/tap network device.
 *
 * If a device named ifr->ifr_name already exists in `net`, the file is
 * attached to it after type, multi-queue and permission checks. Otherwise a
 * new device is allocated, initialised, attached to the file and
 * registered. On success the final device name is copied back into
 * ifr->ifr_name.
 *
 * Returns 0 on success or a negative errno.
 */
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
        struct tun_struct *tun;
        struct tun_file *tfile = file->private_data;
        struct net_device *dev;
        int err;

        if (tfile->detached)
                return -EINVAL;

        dev = __dev_get_by_name(net, ifr->ifr_name);  // does a device with this name already exist?
        if (dev) {  
                if (ifr->ifr_flags & IFF_TUN_EXCL)
                        return -EBUSY;
                /* The existing device must be a tun/tap of the requested type. */
                if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
                        tun = netdev_priv(dev);
                else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
                        tun = netdev_priv(dev);
                else
                        return -EINVAL;

                /* The multi-queue setting of request and device must agree. */
                if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
                    !!(tun->flags & IFF_MULTI_QUEUE))
                        return -EINVAL;

                if (tun_not_capable(tun))
                        return -EPERM;
                err = security_tun_dev_open(tun->security);
                if (err < 0)
                        return err;

                err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER); // link the file's private data with the device's: tfile goes into tun->tfiles[]
                if (err < 0)
                        return err;

                if (tun->flags & IFF_MULTI_QUEUE &&
                    (tun->numqueues + tun->numdisabled > 1)) {
                        /* One or more queue has already been attached, no need
                         * to initialize the device again.
                         */
                        return 0;
                }
        }
        else {
                char *name;
                unsigned long flags = 0;
                int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
                             MAX_TAP_QUEUES : 1;

                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                err = security_tun_dev_create();
                if (err < 0)
                        return err;

                /* Set dev type */
                if (ifr->ifr_flags & IFF_TUN) {
                        /* TUN device */
                        flags |= IFF_TUN;
                        name = "tun%d";
                } else if (ifr->ifr_flags & IFF_TAP) {
                        /* TAP device */
                        flags |= IFF_TAP;
                        name = "tap%d";
                } else
                        return -EINVAL;

                /* A caller-supplied name overrides the default template. */
                if (*ifr->ifr_name)
                        name = ifr->ifr_name;

                dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
                                       NET_NAME_UNKNOWN, tun_setup, queues,
                                       queues); // no such device yet: allocate a new one

                if (!dev)
                        return -ENOMEM;

                dev_net_set(dev, net);
                dev->rtnl_link_ops = &tun_link_ops;
                dev->ifindex = tfile->ifindex;
                dev->sysfs_groups[0] = &tun_attr_group;

                tun = netdev_priv(dev);
                tun->dev = dev;
                tun->flags = flags;
                tun->txflt.count = 0;
                tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

                tun->align = NET_SKB_PAD;
                tun->filter_attached = false;
                tun->sndbuf = tfile->socket.sk->sk_sndbuf;
                tun->rx_batched = 0;

                tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
                if (!tun->pcpu_stats) {
                        err = -ENOMEM;
                        goto err_free_dev;
                }

                spin_lock_init(&tun->lock);

                err = security_tun_dev_alloc_security(&tun->security);
                if (err < 0)
                        goto err_free_stat;

                tun_net_init(dev);
                tun_flow_init(tun);

                dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
                                   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
                                   NETIF_F_HW_VLAN_STAG_TX;
                dev->features = dev->hw_features | NETIF_F_LLTX;
                dev->vlan_features = dev->features &
                                     ~(NETIF_F_HW_VLAN_CTAG_TX |
                                       NETIF_F_HW_VLAN_STAG_TX);

                INIT_LIST_HEAD(&tun->disabled);    // everything above initialises the device and its private data (tun)
                err = tun_attach(tun, file, false); // link the file's private data with the device's: tfile goes into tun->tfiles[]
                if (err < 0)
                        goto err_free_flow;

                err = register_netdevice(tun->dev);
                if (err < 0)
                        goto err_detach;
        }

        netif_carrier_on(tun->dev);

        tun_debug(KERN_INFO, tun, "tun_set_iff\n");

        /* Take the TUN_FEATURES bits from the request, keep the other flags. */
        tun->flags = (tun->flags & ~TUN_FEATURES) |
                (ifr->ifr_flags & TUN_FEATURES);

        /* Make sure persistent devices do not get stuck in
         * xoff state.
         */
        if (netif_running(tun->dev))
                netif_tx_wake_all_queues(tun->dev);

        /* Report the final device name back to the caller. */
        strcpy(ifr->ifr_name, tun->dev->name);
        return 0;

        /* Error unwinding for the "create new device" branch only. */
err_detach:
        tun_detach_all(dev);
err_free_flow:
        tun_flow_uninit(tun);
        security_tun_dev_free_security(tun->security);
err_free_stat:
        free_percpu(tun->pcpu_stats);
err_free_dev:
        free_netdev(dev);
        return err;
}

至此,tun设备 已经创建完成。

通过tun设备发送数据。
发送函数是tun_chr_write_iter

/*
 * write() path of the tun character device.
 *
 * Takes a reference on the device attached to the file, passes the user
 * data to tun_get_user() (which builds an skb from it and injects it into
 * the receive path), and drops the reference again.
 *
 * Returns the result of tun_get_user(), or -EBADFD when no device is
 * attached to the file.
 */
static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *filp = iocb->ki_filp;
        struct tun_file *tfile = filp->private_data;
        struct tun_struct *tun;
        ssize_t written;

        tun = tun_get(filp);
        if (!tun)
                return -EBADFD;

        written = tun_get_user(tun, tfile, NULL, from,
                               filp->f_flags & O_NONBLOCK, false);
        tun_put(tun);

        return written;
}

tun_get_user函数 实体

/*
 * Build an skb from data supplied by the writer and inject it into the
 * receive path of the tun device (excerpt; "......" marks code elided by
 * the article).
 *
 * Returns total_len on the path shown, or the error from tun_alloc_skb().
 */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                            void *msg_control, struct iov_iter *from,
                            int noblock, bool more)
{
......
        skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
        if (IS_ERR(skb)) {
                /* -EAGAIN is not counted as a drop. */
                if (PTR_ERR(skb) != -EAGAIN)
                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
                return PTR_ERR(skb);
        }

        if (zerocopy)
                err = zerocopy_sg_from_iter(skb, from);
        else
                err = skb_copy_datagram_from_iter(skb, 0, from, len);   // fill the freshly allocated skb with the caller's data
......
#ifndef CONFIG_4KSTACKS
        tun_rx_batched(tun, tfile, skb, more);  // hand the packet(s) to the protocol stack, batched
#else
        netif_rx_ni(skb);       // hand the packet to the protocol stack
#endif

        /* Account the packet as received by the device. */
        stats = get_cpu_ptr(tun->pcpu_stats);
        u64_stats_update_begin(&stats->syncp);
        stats->rx_packets++;
        stats->rx_bytes += len;
        u64_stats_update_end(&stats->syncp);
        put_cpu_ptr(stats);

        tun_flow_update(tun, rxhash, tfile);
        return total_len;
}

通过tun 设备收数据包
内核协议栈通过tun网卡发送数据包时,最终调用tun_net_xmit函数,把数据包放入队列,等待用户态进程通过文件读取

/*
 * Excerpt of tun_net_xmit(), the transmit function of the tun net device;
 * parts left out by the article are marked with a "......" line.
 *
 * The skb is queued on the tx_array of the tun_file selected by the skb's
 * queue mapping, and the reader of that file is notified.  Returns
 * NETDEV_TX_OK on success; on the drop path the tx_dropped counter is
 * incremented, the skb is freed and NET_XMIT_DROP is returned.
 */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        int txq = skb->queue_mapping;
        struct tun_file *tfile;
        u32 numqueues = 0;

        rcu_read_lock();
        tfile = rcu_dereference(tun->tfiles[txq]); /* tun_file bound to this tx queue */
        numqueues = ACCESS_ONCE(tun->numqueues);

        /* Drop packet if interface is not attached */
        if (txq >= numqueues)
                goto drop;
......
        /* Queue the skb in the tun_file's tx_array; a non-zero result means drop. */
        if (skb_array_produce(&tfile->tx_array, skb))
                goto drop;

        /* Notify and wake up reader process */
        if (tfile->flags & TUN_FASYNC)
                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
        tfile->socket.sk->sk_data_ready(tfile->socket.sk);     /* wake up a reader blocked on the file */

        rcu_read_unlock();
        return NETDEV_TX_OK;

drop:
        this_cpu_inc(tun->pcpu_stats->tx_dropped);
        skb_tx_error(skb);
        kfree_skb(skb);
        rcu_read_unlock();
        return NET_XMIT_DROP;
}

文件读取数据函数

/*
 * Read handler of the tun character device.
 *
 * Fetches one queued packet through tun_do_read() and copies it into @to.
 * The returned count is clamped to the number of bytes the caller asked
 * for; a positive count is also stored in iocb->ki_pos.
 *
 * Returns the (clamped) result of tun_do_read(), or -EBADFD when no tun
 * device is attached to the file.
 */
static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *filp = iocb->ki_filp;
        struct tun_file *tfile = filp->private_data;
        struct tun_struct *tun = __tun_get(tfile);
        ssize_t wanted = iov_iter_count(to);
        ssize_t copied;

        if (!tun)
                return -EBADFD;

        /* Pull one packet from the file's queue into the user buffer. */
        copied = tun_do_read(tun, tfile, to, filp->f_flags & O_NONBLOCK, NULL);

        /* Never report more than the caller's buffer could hold. */
        if (copied > wanted)
                copied = wanted;

        if (copied > 0)
                iocb->ki_pos = copied;

        tun_put(tun);
        return copied;
}

tun_do_read函数实体

static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
                           struct iov_iter *to,
                           int noblock, struct sk_buff *skb)
{
        ssize_t ret;
        int err;

        tun_debug(KERN_INFO, tun, "tun_do_read\n");

        if (!iov_iter_count(to))
                return 0;

        if (!skb) {
                /* Read frames from ring */
                skb = tun_ring_recv(tfile, noblock, &err); //读取缓冲区的数据
                if (!skb)
                        return err;
        }

        ret = tun_put_user(tun, tfile, skb, to);        //将数据存放到用户态进程内存中
        if (unlikely(ret < 0))
                kfree_skb(skb);
        else
                consume_skb(skb);

        return ret;
}

tun_ring_recv函数实体

/*
 * Take one skb from the tx_array of @tfile, sleeping if necessary.
 *
 * @tfile:   file whose tx_array is consumed
 * @noblock: when non-zero, do not sleep; report -EAGAIN if the array is empty
 * @err:     set to 0 on success, or to -EAGAIN / -ERESTARTSYS / -EFAULT
 *
 * Returns the skb, or NULL with *err set when none was obtained.
 *
 * The task state is set to TASK_INTERRUPTIBLE at the top of every loop
 * iteration.  Setting it only once before the loop lets the task come
 * back from schedule() in TASK_RUNNING; if no skb is available at that
 * point, every further schedule() returns immediately and the loop
 * degenerates into a busy wait.  set_current_state() is used (rather than
 * a plain store) so the state change is ordered before the re-check of the
 * queue and a concurrent wakeup is not lost.
 */
static struct sk_buff *tun_ring_recv(struct tun_file *tfile, int noblock,
                                     int *err)
{
        DECLARE_WAITQUEUE(wait, current);
        struct sk_buff *skb = NULL;
        int error = 0;

        /* Fast path: a packet is already queued. */
        skb = skb_array_consume(&tfile->tx_array);
        if (skb)
                goto out;
        if (noblock) {
                error = -EAGAIN;
                goto out;
        }

        add_wait_queue(&tfile->wq.wait, &wait);

        while (1) {
                /* Must be re-armed before each check of the queue. */
                set_current_state(TASK_INTERRUPTIBLE);
                skb = skb_array_consume(&tfile->tx_array);
                if (skb)
                        break;
                if (signal_pending(current)) {
                        error = -ERESTARTSYS;
                        break;
                }
                if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
                        error = -EFAULT;
                        break;
                }

                /* Sleep until woken (e.g. by the transmit path queuing an skb). */
                schedule();
        }

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&tfile->wq.wait, &wait);

out:
        *err = error;
        return skb;
}

至此,tun/tap设备的收发包过程就已经清楚了。
tun/tap 设备在做虚拟机backend时,如果有vhost-net,还有另外一条收发包路径,等分析vhost-net时我再细说

分类
linux virtual nic

loopback 网卡

LOOPBACK 回环网卡私有数据结构
该网卡没有私有数据结构

内核模块注册

/*
 * Per-network-namespace operations of the loopback driver.  Only .init is
 * set: loopback_net_init creates the "lo" device of a namespace.
 */
struct pernet_operations __net_initdata loopback_net_ops = {
        .init = loopback_net_init,
};

每个网络命名空间都必须有一个lo网卡,因此新创建的命名空间需要先运行 loopback_net_init 函数,给当前命名空间创建 lo 网卡

网卡创建

/*
 * Create and register the "lo" device for network namespace @net and
 * record it in net->loopback_dev.
 *
 * Returns 0 on success.  On failure returns -ENOMEM (allocation) or the
 * error from register_netdev(); if @net is init_net the failure panics
 * instead of returning.
 */
static __net_init int loopback_net_init(struct net *net)
{
        struct net_device *dev;
        int err;

        err = -ENOMEM;
        dev = alloc_netdev(0, "lo", NET_NAME_UNKNOWN, loopback_setup); /* allocate the device; loopback_setup initialises it */
        if (!dev)
                goto out;

        dev_net_set(dev, net); /* bind the device to this namespace */
        err = register_netdev(dev); /* register the device */
        if (err)
                goto out_free_netdev;

        BUG_ON(dev->ifindex != LOOPBACK_IFINDEX); /* lo must have received ifindex LOOPBACK_IFINDEX */
        net->loopback_dev = dev; /* remember it as this namespace's loopback device */
        return 0;

out_free_netdev:
        free_netdev(dev);
out:
        if (net_eq(net, &init_net))
                panic("loopback: Failed to register netdevice: %d\n", err);
        return err;
}

初始化网卡,及环回网卡操作函数

/* Net-device operations installed on the loopback device by loopback_setup(). */
static const struct net_device_ops loopback_ops = {
        .ndo_init        = loopback_dev_init,
        .ndo_start_xmit  = loopback_xmit,  /* transmit function */
        .ndo_get_stats64 = loopback_get_stats64,
        .ndo_set_mac_address = eth_mac_addr,
};

/* The loopback device is special. There is only one instance
 * per network namespace.
 *
 * Setup callback passed to alloc_netdev() by loopback_net_init(): fills in
 * MTU, header sizes, device type, flags, feature bits and the operation
 * tables of the loopback device.
 */
static void loopback_setup(struct net_device *dev)
{
        dev->mtu                = 64 * 1024;
        dev->hard_header_len    = ETH_HLEN;     /* 14   */
        dev->min_header_len     = ETH_HLEN;     /* 14   */
        dev->addr_len           = ETH_ALEN;     /* 6    */
        dev->type               = ARPHRD_LOOPBACK;      /* 0x0001*/
        dev->flags              = IFF_LOOPBACK;
        dev->priv_flags         |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
        netif_keep_dst(dev);
        dev->hw_features        = NETIF_F_GSO_SOFTWARE;
        dev->features           = NETIF_F_SG | NETIF_F_FRAGLIST
                | NETIF_F_GSO_SOFTWARE
                | NETIF_F_HW_CSUM
                | NETIF_F_RXCSUM
                | NETIF_F_SCTP_CRC
                | NETIF_F_HIGHDMA
                | NETIF_F_LLTX
                | NETIF_F_NETNS_LOCAL
                | NETIF_F_VLAN_CHALLENGED
                | NETIF_F_LOOPBACK;
        dev->ethtool_ops        = &loopback_ethtool_ops;
        dev->header_ops         = &eth_header_ops;
        dev->netdev_ops         = &loopback_ops; /* install the device operations (loopback_ops) */
        dev->needs_free_netdev  = true;
        dev->priv_destructor    = loopback_dev_free;
}

网卡发送函数

/*
 * Transmit function of the loopback device: every skb sent on the device
 * is fed straight back into the receive path through netif_rx().
 *
 * Byte and packet counters are updated only when netif_rx() reports
 * NET_RX_SUCCESS.  Always returns NETDEV_TX_OK.
 */
static netdev_tx_t loopback_xmit(struct sk_buff *skb,
                                 struct net_device *dev)
{
        struct pcpu_lstats *lb_stats;
        int len;

        skb_tx_timestamp(skb);
        skb_orphan(skb);

        /* Before queueing this packet to netif_rx(),
         * make sure dst is refcounted.
         */
        skb_dst_force(skb);

        skb->protocol = eth_type_trans(skb, dev); /* determine the packet's protocol; dev is passed as the receiving device */

        /* it's OK to use per_cpu_ptr() because BHs are off */
        lb_stats = this_cpu_ptr(dev->lstats);

        /* Length is saved before the skb is handed to netif_rx(). */
        len = skb->len;
        if (likely(netif_rx(skb) == NET_RX_SUCCESS)) {  /* hand the packet up to the protocol stack */
                u64_stats_update_begin(&lb_stats->syncp);
                lb_stats->bytes += len;
                lb_stats->packets++;
                u64_stats_update_end(&lb_stats->syncp);
        }

        return NETDEV_TX_OK;
}

loopback网卡,是linux 设备中的第一个网卡,在本机发往本地任何网卡IP的数据包都是走lo网卡。

分类
linux virtual nic

VETH 网卡

VETH 类型网卡私有数据结构

/* Private data of a veth device (its size is given in veth_link_ops.priv_size). */
struct veth_priv {
    struct net_device  __rcu *peer;     /* the paired veth device; set in veth_newlink() */
    atomic64_t      dropped;            /* incremented by veth_xmit() for packets that were not delivered */
    unsigned        requested_headroom;
};

veth_link_ops结构体实例

/*
 * rtnl link operations of the veth driver; registered in veth_init() so
 * that user space can create veth devices through netlink.
 */
static struct rtnl_link_ops veth_link_ops ={
    .kind           = DRV_NAME,                 /* name of this virtual NIC type */
    .priv_size      = sizeof(struct veth_priv), /* size of the per-device private data */
    .setup          = veth_setup,               /* called when a device is created */
    .validate       = veth_validate,            /* checks the parameters passed from user space */
    .newlink        = veth_newlink,             /* configures the device from the user-space parameters */
    .dellink        = veth_dellink,             /* called on deletion to release resources */
    .policy         = veth_policy,              /* usage not traced in this article; not needed for the flow described */
    .maxtype        = VETH_INFO_MAX,            /* usage not traced in this article; not needed for the flow described */
    .get_link_net   = veth_get_link_net,        /* returns the network namespace the device lives in */
};

内核模块注册

/*
 * Module init: registers veth_link_ops so that devices of this type can be
 * created from user space through netlink.  Returns the result of
 * rtnl_link_register().
 */
static __init int veth_init(void)
{
    return rtnl_link_register(&veth_link_ops); /* register the veth link operations */
}

创建虚拟网卡 逻辑
用户态通过 netlink 通知创建虚拟网卡,创建时先调用 veth_setup 函数 再调用 veth_newlink函数
veth_setup 函数

/*
 * Setup callback of veth devices (veth_link_ops.setup).
 *
 * Starts from the generic Ethernet defaults of ether_setup() and then
 * applies the veth-specific private flags, operation tables, feature
 * masks, destructor and maximum MTU.
 */
static void veth_setup(struct net_device *dev)
{
    ether_setup(dev);

    /* Private flags: clear TX skb sharing, then set the veth bits. */
    dev->priv_flags &= ~IFF_TX_SKB_SHARING;
    dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE | IFF_PHONY_HEADROOM;

    /* Operation tables; veth_netdev_ops supplies the transmit path (veth_xmit). */
    dev->netdev_ops = &veth_netdev_ops;
    dev->ethtool_ops = &veth_ethtool_ops;

    /* Feature masks.  vlan_features is derived from the final dev->features. */
    dev->features |= NETIF_F_LLTX | VETH_FEATURES;
    dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX |
                                           NETIF_F_HW_VLAN_STAG_TX |
                                           NETIF_F_HW_VLAN_CTAG_RX |
                                           NETIF_F_HW_VLAN_STAG_RX);
    dev->hw_features = VETH_FEATURES;
    dev->hw_enc_features = VETH_FEATURES;
    dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;

    /* Teardown and limits. */
    dev->needs_free_netdev = true;
    dev->priv_destructor = veth_dev_free;
    dev->max_mtu = ETH_MAX_MTU;
}

veth_newlink 函数 主要逻辑如下

/*
 * Excerpt of veth_newlink() showing only the main logic; parts left out by
 * the article are marked with "......." lines.
 *
 * The visible part creates the peer device and then points the private
 * data of each of the two devices at the other one.
 */
static int veth_newlink(struct net *src_net, struct net_device *dev,
                        struct nlattr *tb[], struct nlattr *data[],
                        struct netlink_ext_ack *extack)
{
    .......
    peer = rtnl_create_link(net, ifname, name_assign_type,
                                &veth_link_ops, tbp); /* create the paired (peer) device */
    .......
    priv = netdev_priv(dev);
    rcu_assign_pointer(priv->peer, peer); /* dev's private data: peer is the newly created device */

    priv = netdev_priv(peer);
    rcu_assign_pointer(priv->peer, dev); /* peer's private data: peer is dev */
    return 0;

}

到这里网卡 创建完成。接下来是收发数据包。
发包流程
在 veth_setup 函数中我们看到 veth_xmit 是网卡发包函数。
veth_xmit 函数体如下

/*
 * Transmit function of a veth device: forwards the skb to the peer device
 * with dev_forward_skb().
 *
 * On success the per-CPU byte/packet counters of @dev are updated.  If
 * there is no peer (the skb is freed here) or forwarding fails, the
 * "dropped" counter in the private data is incremented instead.
 * Always returns NETDEV_TX_OK.
 */
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{       
    struct veth_priv *priv = netdev_priv(dev);
    struct net_device *rcv;
    int length = skb->len; /* saved before the skb is handed to dev_forward_skb() */

    rcu_read_lock();
    rcv = rcu_dereference(priv->peer); /* look up the peer device */
    if (unlikely(!rcv)) {
        kfree_skb(skb);
        goto drop;
    }

    if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {  /* deliver the packet to the peer */
        struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats); /* account successfully sent data */

        u64_stats_update_begin(&stats->syncp);
        stats->bytes += length;
         stats->packets++;
        u64_stats_update_end(&stats->syncp);
    } else {
drop:
        atomic64_inc(&priv->dropped);   /* account data that could not be sent */
    }
    rcu_read_unlock();
    return NETDEV_TX_OK;
}

dev_forward_skb 函数体

/*
 * Forward @skb to device @dev as if it had been received there.
 *
 * __dev_forward_skb() prepares the skb first; a non-zero result from it is
 * returned unchanged.  Otherwise the skb is handed to the protocol stack
 * with netif_rx_internal() and that function's result is returned.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
    int err = __dev_forward_skb(dev, skb);

    if (err)
        return err;

    return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

__dev_forward_skb 函数体

/*
 * Prepare @skb for reception on @dev without delivering it yet.
 *
 * Runs ____dev_forward_skb(); when that succeeds (returns 0) the protocol
 * above Ethernet is determined with eth_type_trans(), which is also given
 * @dev as the receiving device, and the receive checksum is adjusted for
 * the pulled Ethernet header.
 *
 * Returns the result of ____dev_forward_skb().
 */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
    int err = ____dev_forward_skb(dev, skb);

    if (likely(err == 0)) {
        skb->protocol = eth_type_trans(skb, dev);
        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
    }

    return err;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

收包流程。收包流程在发包流程中已经可以看到:对端网卡发包时,通过调用 netif_rx_internal 函数将数据包上送到协议栈,之后的处理流程就是网卡的收包流程。