macvlan 虚拟网卡设备
macvlan虚拟网卡设备是寄生在物理网卡设备上的。发包时调用自己的发包函数,查找到寄生的物理设备,然后通过物理设备发包。收包时,通过在寄生的物理设备上注册的rx_handler回调函数处理数据包。
macvlan 虚拟网卡设备包括5种模式
private 模式:在这种模式下,macvlan设备不能接收寄生在同一个物理网卡的其他macvlan设备的数据包,即使是其他macvlan设备通过物理网卡发送出去并通过hairpin设备返回的包也不能接收
vepa 模式:在这种模式下,macvlan设备不能直接接收寄生在同一个物理网卡的其他macvlan设备的数据包,但是其他macvlan设备可以将数据包通过物理网卡发送出去,再由hairpin设备返回给目的macvlan设备
passthru 模式:在这种模式下,每一个物理设备只能寄生一个macvlan设备
bridge 模式:在这种模式下,寄生在同一个物理设备的macvlan设备可以直接通讯,不需要外接的hairpin设备帮助
source 模式: 在这种模式下,寄生在物理设备的这类macvlan设备,只能接收源mac地址为指定地址的数据包,其他数据包都不接收。
macvlan设备 关键数据结构
macvlan_port ,这个数据结构是在注册rx_handler时使用的,作为回调函数的参数。
/*
 * Per-lower-device state shared by all macvlan devices stacked on one
 * physical device. The receive hook fetches it from the lower device
 * (macvlan_port_get_rcu() in macvlan_handle_frame()).
 */
struct macvlan_port {
struct net_device *dev; // lower (physical) device
struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]; // hash table for looking up macvlan devices by MAC address
struct list_head vlans; // list of the macvlan devices on this port (linked via macvlan_dev.list)
struct sk_buff_head bc_queue; // queue of broadcast/multicast frames awaiting delivery
struct work_struct bc_work; // work item that drains bc_queue (macvlan_process_broadcast)
u32 flags; // port flags
int count; // number of macvlan devices on this port
struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]; // used only by source-mode macvlan devices
// bitmap tested against the destination-address hash before a multicast frame is queued
DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
unsigned char perm_addr[ETH_ALEN];
};
macvlan_dev ,这个数据结构是macvlan网卡的私有数据结构,每创建一个macvlan设备就会创建一个这样的结构,并将这个数据结构挂在 macvlan_port 数据结构上。
/*
 * Private data of one macvlan net_device (obtained with netdev_priv()).
 * Every instance is linked onto the macvlan_port of its lower device.
 */
struct macvlan_dev {
struct net_device *dev; // the macvlan net_device itself
struct list_head list; // entry in port->vlans
struct hlist_node hlist; // entry in the port's hash table
struct macvlan_port *port; // port of the lower device
struct net_device *lowerdev; // lower (physical) device
void *fwd_priv; // handle returned by the lower device's forwarding offload; NULL when not offloaded
struct vlan_pcpu_stats __percpu *pcpu_stats;
DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
netdev_features_t set_features;
enum macvlan_mode mode;
u16 flags;
/* This array tracks active taps. */
struct tap_queue __rcu *taps[MAX_TAP_QUEUES];
/* This list tracks all taps (both enabled and disabled) */
struct list_head queue_list;
int numvtaps;
int numqueues;
netdev_features_t tap_features;
int minor;
int nest_level;
#ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll *netpoll;
#endif
unsigned int macaddr_count;
};
两个私有结构之间的关系
// Relationship between struct macvlan_port and struct macvlan_dev:
// the port's hash tables lead to the macvlan_dev instances stacked on it.
digraph tun{
node [shape = plaintext]
rankdir = LR
macvlan_port[label=<
<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
<TR><TD BORDER="0" ALIGN="CENTER">struct macvlan_port</TD></TR>
<TR><TD ALIGN="LEFT" PORT="f0">struct net_device *dev</TD></TR>
<TR><TD ALIGN="LEFT" PORT="f1">struct hlist_head vlan_hash[MACVLAN_HASH_SIZE]</TD></TR>
<TR><TD ALIGN="CENTER">struct list_head vlans</TD></TR>
<TR><TD ALIGN="LEFT">struct sk_buff_head bc_queue</TD></TR>
<TR><TD ALIGN="CENTER">struct work_struct bc_work</TD></TR>
<TR><TD ALIGN="LEFT">u32 flags</TD></TR>
<TR><TD ALIGN="CENTER">......</TD></TR>
<TR><TD ALIGN="LEFT" PORT="f2">struct hlist_head vlan_source_hash[MACVLAN_HASH_SIZE]</TD></TR>
<TR><TD ALIGN="CENTER">......</TD></TR>
</TABLE>>]
macvlan_dev[label=<
<TABLE BORDER="0" CELLBORDER="1" CELLSPACING="0">
<TR><TD BORDER="0" ALIGN="CENTER">struct macvlan_dev</TD></TR>
<TR><TD ALIGN="LEFT" PORT="f0">struct net_device *dev</TD></TR>
<TR><TD ALIGN="LEFT">struct list_head list</TD></TR>
<TR><TD ALIGN="LEFT">struct hlist_node hlist</TD></TR>
<TR><TD ALIGN="LEFT">struct macvlan_port *port</TD></TR>
<TR><TD ALIGN="LEFT">struct net_device *lowerdev</TD></TR>
<TR><TD ALIGN="LEFT">void *fwd_priv</TD></TR>
<TR><TD ALIGN="CENTER">......</TD></TR>
</TABLE>>]
// both hash tables resolve to macvlan_dev instances
macvlan_port:f1 -> macvlan_dev:f0
macvlan_port:f2 -> macvlan_dev:f0
}
模块注册
/*
 * Module entry point: install the netdevice notifier, then register the
 * macvlan link ops. If registering the link ops fails, the notifier is
 * removed again and the error code is returned; otherwise returns 0.
 */
static int __init macvlan_init_module(void)
{
	int rc;

	register_netdevice_notifier(&macvlan_notifier_block);

	rc = macvlan_link_register(&macvlan_link_ops);
	if (rc >= 0)
		return 0;

	/* roll back the notifier installed above */
	unregister_netdevice_notifier(&macvlan_notifier_block);
	return rc;
}
创建macvlan设备
/*
 * macvlan_common_newlink - set up and register a new macvlan device.
 * @src_net: namespace in which the IFLA_LINK interface index is resolved
 * @dev:     the macvlan net_device being created
 * @tb:      generic link attributes; IFLA_LINK is mandatory, IFLA_MTU and
 *           IFLA_ADDRESS are optional
 * @data:    macvlan-specific attributes (mode, flags, source MAC handling);
 *           may be NULL
 *
 * Returns 0 on success or a negative errno. When this call created the
 * macvlan_port on the lower device, the port is destroyed again on failure.
 */
int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct macvlan_port *port;
struct net_device *lowerdev;
int err;
int macmode;
bool create = false; // true once this call has created the port
if (!tb[IFLA_LINK])
return -EINVAL;
lowerdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
if (lowerdev == NULL)
return -ENODEV;
/* When creating macvlans or macvtaps on top of other macvlans - use
* the real device as the lowerdev.
*/
if (netif_is_macvlan(lowerdev))
lowerdev = macvlan_dev_real_dev(lowerdev); // resolve to the underlying physical device
// inherit the lower device's MTU unless one was given; never exceed it
if (!tb[IFLA_MTU])
dev->mtu = lowerdev->mtu;
else if (dev->mtu > lowerdev->mtu)
return -EINVAL;
/* MTU range: 68 - lowerdev->max_mtu */
dev->min_mtu = ETH_MIN_MTU;
dev->max_mtu = lowerdev->max_mtu;
// no address supplied: pick a random one
if (!tb[IFLA_ADDRESS])
eth_hw_addr_random(dev);
if (!macvlan_port_exists(lowerdev)) {
err = macvlan_port_create(lowerdev); // the lower device has no port yet: create the macvlan_port (expected to register the rx_handler as well)
if (err < 0)
return err;
create = true;
}
port = macvlan_port_get_rtnl(lowerdev);
/* Only 1 macvlan device can be created in passthru mode */
if (macvlan_passthru(port)) { // the port is already in passthru mode: refuse another device
/* The macvlan port must be not created this time,
* still goto destroy_macvlan_port for readability.
*/
err = -EINVAL;
goto destroy_macvlan_port;
}
vlan->lowerdev = lowerdev;
vlan->dev = dev;
vlan->port = port;
vlan->set_features = MACVLAN_FEATURES;
vlan->nest_level = dev_get_nest_level(lowerdev) + 1;
// VEPA is the default mode unless IFLA_MACVLAN_MODE overrides it
vlan->mode = MACVLAN_MODE_VEPA;
if (data && data[IFLA_MACVLAN_MODE])
vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
if (data && data[IFLA_MACVLAN_FLAGS])
vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
if (port->count) { // a passthru device must be the only macvlan on the lower device: fail if any already exist
err = -EINVAL;
goto destroy_macvlan_port;
}
macvlan_set_passthru(port);
eth_hw_addr_inherit(dev, lowerdev);
}
if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
// source-address configuration is only valid for source-mode devices
if (vlan->mode != MACVLAN_MODE_SOURCE) {
err = -EINVAL;
goto destroy_macvlan_port;
}
macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]);
err = macvlan_changelink_sources(vlan, macmode, data); // install the source MAC addresses this device accepts; a source-mode macvlan only receives frames from those senders
if (err)
goto destroy_macvlan_port;
}
err = register_netdevice(dev);
if (err < 0)
goto destroy_macvlan_port;
dev->priv_flags |= IFF_MACVLAN;
err = netdev_upper_dev_link(lowerdev, dev);
if (err)
goto unregister_netdev;
list_add_tail_rcu(&vlan->list, &port->vlans); // add the new device to the port's list
netif_stacked_transfer_operstate(lowerdev, dev);
linkwatch_fire_event(dev);
return 0;
unregister_netdev:
unregister_netdevice(dev);
destroy_macvlan_port:
// NOTE(review): if a failed register_netdevice()/unregister_netdevice() already
// tears the port down through the device's uninit hook, this would destroy it
// a second time -- confirm against the rest of the driver.
if (create)
macvlan_port_destroy(port->dev);
return err;
}
收包逻辑
/*
 * macvlan_handle_frame - receive hook for frames arriving on the lower device.
 * @pskb: in/out pointer to the frame; rewritten whenever the skb is replaced
 *
 * Multicast frames are either handed back to the sending macvlan (when the
 * sender is a local macvlan that is neither VEPA nor bridge mode) or queued
 * for deferred delivery and passed on to the lower device. Unicast frames
 * are steered to the macvlan that matches the destination address (or to
 * the single device of a passthru port).
 *
 * Returns RX_HANDLER_CONSUMED when the frame was taken or dropped here,
 * RX_HANDLER_PASS when the lower device should keep processing it, and
 * RX_HANDLER_ANOTHER after skb->dev was switched to the macvlan device.
 */
static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
{
struct macvlan_port *port;
struct sk_buff *skb = *pskb;
const struct ethhdr *eth = eth_hdr(skb);
const struct macvlan_dev *vlan;
const struct macvlan_dev *src;
struct net_device *dev;
unsigned int len = 0;
int ret;
rx_handler_result_t handle_res;
port = macvlan_port_get_rcu(skb->dev);
if (is_multicast_ether_addr(eth->h_dest)) { // multicast (including broadcast) destination
unsigned int hash;
skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN);
if (!skb)
return RX_HANDLER_CONSUMED;
// the skb may have been replaced: refresh the caller's pointer and the header
*pskb = skb;
eth = eth_hdr(skb);
macvlan_forward_source(skb, port, eth->h_source);// delivery to source-mode macvlans, keyed on the sender MAC
src = macvlan_hash_lookup(port, eth->h_source);// is the sender a macvlan on this same lower device?
if (src && src->mode != MACVLAN_MODE_VEPA &&
src->mode != MACVLAN_MODE_BRIDGE) { // sender is a local macvlan in source, private or passthru mode: give the frame to that device only
/* forward to original port. */
vlan = src;
ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:
netif_rx(skb);
handle_res = RX_HANDLER_CONSUMED;
goto out;
}
hash = mc_hash(NULL, eth->h_dest);
if (test_bit(hash, port->mc_filter))
macvlan_broadcast_enqueue(port, src, skb); // queue the frame; the port's work item delivers it later
return RX_HANDLER_PASS; // the lower device also processes the frame
}
macvlan_forward_source(skb, port, eth->h_source); // unicast: delivery to source-mode macvlans
if (macvlan_passthru(port))
vlan = list_first_or_null_rcu(&port->vlans,
struct macvlan_dev, list);
else
vlan = macvlan_hash_lookup(port, eth->h_dest); // find the macvlan that owns the destination MAC
if (vlan == NULL)
return RX_HANDLER_PASS; // no match: continue with the lower device's normal processing
dev = vlan->dev;
if (unlikely(!(dev->flags & IFF_UP))) {
kfree_skb(skb);
return RX_HANDLER_CONSUMED; // target macvlan is down: drop the frame
}
len = skb->len + ETH_HLEN;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
ret = NET_RX_DROP;
handle_res = RX_HANDLER_CONSUMED;
goto out;
}
*pskb = skb;
skb->dev = dev;
skb->pkt_type = PACKET_HOST; // match found and device is up: mark the frame as addressed to this host
ret = NET_RX_SUCCESS;
handle_res = RX_HANDLER_ANOTHER; // continue receive processing on the macvlan device
out:
// account the frame on the chosen macvlan as received or dropped
macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
return handle_res;
}
发送工作队列,工作函数
static void macvlan_process_broadcast(struct work_struct *w)
{
struct macvlan_port *port = container_of(w, struct macvlan_port,
bc_work);
struct sk_buff *skb;
struct sk_buff_head list;
__skb_queue_head_init(&list);
spin_lock_bh(&port->bc_queue.lock);
skb_queue_splice_tail_init(&port->bc_queue, &list);
spin_unlock_bh(&port->bc_queue.lock);
while ((skb = __skb_dequeue(&list))) {
const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;
rcu_read_lock();
if (!src) //如果不是寄生在同一个 物理设备上的macvlan发送,则所有macvlan都 收包
/* frame comes from an external address */
macvlan_broadcast(skb, port, NULL,
MACVLAN_MODE_PRIVATE |
MACVLAN_MODE_VEPA |
MACVLAN_MODE_PASSTHRU|
MACVLAN_MODE_BRIDGE);
else if (src->mode == MACVLAN_MODE_VEPA)//如果是寄生的同一个物理设备上的VEPA类型macvlan发送,则VEPA类型和BRIDGE类型macvlan都受到,private不收
/* flood to everyone except source */
macvlan_broadcast(skb, port, src->dev,
MACVLAN_MODE_VEPA |
MACVLAN_MODE_BRIDGE);
else //如果是寄生在同一个物理设备上的BRIDGE类型macvlan发送,则VEPA类型macvlan受到,private 和 bridge类型不收, bridge不收是因为在发送的时候已经上送给它了。
/*
* flood only to VEPA ports, bridge ports
* already saw the frame on the way out.
*/
macvlan_broadcast(skb, port, src->dev,
MACVLAN_MODE_VEPA);
rcu_read_unlock();
if (src)
dev_put(src->dev);
kfree_skb(skb);
}
}
通过控制广播报文的收发,macvlan保证寄生在同一个物理设备的private设备不能相互收到包。因为arp协议基于广播报文,收不到对方的广播报文也就无法完成地址解析,所以这些设备之间无法通讯。
发包流程
/*
 * Transmit entry point of a macvlan device.
 *
 * Sends @skb through the lower device, using the offloaded path when
 * vlan->fwd_priv is set and macvlan_queue_xmit() otherwise, then updates
 * the per-CPU transmit counters. While netpoll transmission is running
 * the frame is handed to macvlan_netpoll_send_skb() and no counters are
 * touched here.
 *
 * Returns the status reported by the chosen transmit path.
 */
static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	/* sample the length now: the skb is handed off to the transmit path below */
	unsigned int len = skb->len;
	struct vlan_pcpu_stats *stats;
	int ret;

	if (unlikely(netpoll_tx_running(dev)))
		return macvlan_netpoll_send_skb(vlan, skb);

	if (!vlan->fwd_priv) {
		/* no hardware offload: regular software path */
		ret = macvlan_queue_xmit(skb, dev);
	} else {
		/* offloaded path */
		skb->dev = vlan->lowerdev;
		ret = dev_queue_xmit_accel(skb, vlan->fwd_priv);
	}

	if (unlikely(ret != NET_XMIT_SUCCESS && ret != NET_XMIT_CN)) {
		this_cpu_inc(vlan->pcpu_stats->tx_dropped);
		return ret;
	}

	stats = this_cpu_ptr(vlan->pcpu_stats);
	u64_stats_update_begin(&stats->syncp);
	stats->tx_packets++;
	stats->tx_bytes += len;
	u64_stats_update_end(&stats->syncp);

	return ret;
}
macvlan_queue_xmit函数
/*
 * Software transmit path of a macvlan device.
 *
 * A bridge-mode device short-circuits traffic for its bridge-mode
 * siblings on the same lower device:
 *   - multicast frames are first copied to the other bridge-mode devices
 *     and then still sent out through the lower device;
 *   - unicast frames addressed to a bridge-mode sibling are handed to the
 *     lower device with dev_forward_skb() and are not sent out.
 * Everything else is queued for transmission on the lower device.
 *
 * Returns NET_XMIT_SUCCESS for the short-circuited unicast case, otherwise
 * the result of dev_queue_xmit().
 */
static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
	const struct macvlan_dev *vlan = netdev_priv(dev);
	const struct macvlan_port *port = vlan->port;

	if (vlan->mode == MACVLAN_MODE_BRIDGE) {
		const struct ethhdr *hdr = (void *)skb->data;

		if (is_multicast_ether_addr(hdr->h_dest)) {
			/* send to other bridge ports directly */
			macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
		} else {
			const struct macvlan_dev *peer;

			peer = macvlan_hash_lookup(port, hdr->h_dest);
			if (peer && peer->mode == MACVLAN_MODE_BRIDGE) {
				/* send to lowerdev first for its network taps */
				dev_forward_skb(vlan->lowerdev, skb);
				return NET_XMIT_SUCCESS;
			}
		}
	}

	/* default: transmit through the lower device */
	skb->dev = vlan->lowerdev;
	return dev_queue_xmit(skb);
}
打开网络设备
/*
 * macvlan_open - bring a macvlan device up.
 * @dev: the macvlan net_device
 *
 * Three paths:
 *   - passthru port: put the lower device into promiscuous mode (unless
 *     MACVLAN_FLAG_NOPROMISC is set) and add the device to the hash;
 *   - lower device offers L2 forwarding offload: register a station and
 *     return without touching the hash;
 *   - otherwise: add the device address to the lower device's unicast
 *     list, mirror ALLMULTI/PROMISC onto it and add the device to the hash.
 *
 * Returns 0 on success or a negative errno; on failure the steps already
 * taken on the lower device are undone in reverse order.
 */
static int macvlan_open(struct net_device *dev)
{
struct macvlan_dev *vlan = netdev_priv(dev);
struct net_device *lowerdev = vlan->lowerdev;
int err;
if (macvlan_passthru(vlan->port)) {
if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) {
err = dev_set_promiscuity(lowerdev, 1);
if (err < 0)
goto out;
}
goto hash_add;
}
if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD &&
dev->rtnl_link_ops == &macvlan_link_ops) {
vlan->fwd_priv =
lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);
/* If we get a NULL pointer back, or if we get an error
* then we should just fall through to the non accelerated path
*/
if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
vlan->fwd_priv = NULL;
} else
return 0;
} // hardware offload set up when the lower device supports it
// refuse to open while another user of the port already has this MAC address
err = -EBUSY;
if (macvlan_addr_busy(vlan->port, dev->dev_addr))
goto out;
err = dev_uc_add(lowerdev, dev->dev_addr);
if (err < 0)
goto out;
if (dev->flags & IFF_ALLMULTI) {
err = dev_set_allmulti(lowerdev, 1);
if (err < 0)
goto del_unicast;
}
if (dev->flags & IFF_PROMISC) {
err = dev_set_promiscuity(lowerdev, 1); // receive in promiscuous mode on the lower device
if (err < 0)
goto clear_multi;
}
hash_add:
macvlan_hash_add(vlan); // non-offloaded device: add it to the port's hash table
return 0;
clear_multi:
if (dev->flags & IFF_ALLMULTI)
dev_set_allmulti(lowerdev, -1);
del_unicast:
dev_uc_del(lowerdev, dev->dev_addr);
out:
// release the offload station if one is still registered
if (vlan->fwd_priv) {
lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
vlan->fwd_priv);
vlan->fwd_priv = NULL;
}
return err;
}
关闭
/*
 * Bring a macvlan device down, undoing on the lower device what
 * macvlan_open() set up.
 *
 * An offloaded device (vlan->fwd_priv set) only has its hardware station
 * removed. Otherwise the synced unicast/multicast lists are dropped, the
 * promiscuous/allmulti settings taken at open time are released, and the
 * device is removed from the port's hash table.
 *
 * Always returns 0.
 */
static int macvlan_stop(struct net_device *dev)
{
	struct macvlan_dev *vlan = netdev_priv(dev);
	struct net_device *lower = vlan->lowerdev;

	if (vlan->fwd_priv) {
		/* tear down the hardware offload station */
		lower->netdev_ops->ndo_dfwd_del_station(lower, vlan->fwd_priv);
		vlan->fwd_priv = NULL;
		return 0;
	}

	dev_uc_unsync(lower, dev);
	dev_mc_unsync(lower, dev);

	if (macvlan_passthru(vlan->port)) {
		if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC))
			dev_set_promiscuity(lower, -1);
	} else {
		if (dev->flags & IFF_ALLMULTI)
			dev_set_allmulti(lower, -1);

		/* leave promiscuous mode on the lower device */
		if (dev->flags & IFF_PROMISC)
			dev_set_promiscuity(lower, -1);

		/* remove our address from the lower device's unicast list */
		dev_uc_del(lower, dev->dev_addr);
	}

	/* remove the device from the port's hash table */
	macvlan_hash_del(vlan, !dev->dismantle);
	return 0;
}
macvlan当前有很多应用,在docker 和虚拟化中应用很多,在虚拟化中多数使用macvtap设备,在docker中使用macvlan设备。另外,vrrp等一些需要接收发往其他mac地址的数据包的应用也可以使用macvlan设备。