- macvtap设备和macvlan设备的区别
macvtap设备是基于macvlan设备的。即每次创建一个macvtap设备必定会创建一个macvlan设备
macvtap设备和macvlan设备最大的区别是数据包的来源和去向
macvtap设备会关联一个字符设备:通过macvlan设备收上来的数据包会被直接放入字符设备的缓冲区,等待用户态通过字符设备读取;用户态通过字符设备写下来的数据包,则直接通过关联的macvlan设备发送出去。
macvtap 设备收到的数据包直接上送到用户态,用户态程序通过macvtap设备关联的字符设备直接发送数据包。
macvlan 设备收到的数据包上送到协议栈处理,然后协议栈通过发包接口调用macvlan设备发送数据包。
- macvtap设备和tun/tap设备的区别
macvtap设备是将macvlan网卡设备收到的包通过字符设备上送到用户态,用户态写字符设备通过macvtap设备将数据从关联的macvlan设备发送出去。
tun/tap设备则相反:协议栈通过tun/tap虚拟网卡发送的数据包,会经由字符设备上送到用户态;用户态写入字符设备的数据包,则经由tun/tap虚拟网卡上送到协议栈。
总结起来就是:tun/tap设备将虚拟网卡发送的数据包上送到用户态,macvtap设备将虚拟macvlan网卡接收的数据包上送到用户态;tun/tap设备将用户态下发的数据包上送到协议栈,macvtap设备将用户态下发的数据包通过虚拟macvlan设备发送出去。
私有数据结构,这个结构会被当做创建的macvlan网卡设备的私有数据。
/*
 * Private data of a macvtap network device (obtained with netdev_priv()).
 * It bundles the macvlan state with the tap state for the same device.
 */
struct macvtap_dev {
struct macvlan_dev vlan; //macvlan private structure (covered in the previous article)
struct tap_dev tap; //macvtap (tap) private structure
};
/*
 * Per-device tap state: the backing net_device, the queues opened on its
 * character device, and the callbacks supplied by the owning driver.
 */
struct tap_dev {
/* Backing network device; macvtap_newlink() points it at the macvlan device. */
struct net_device *dev;
u16 flags;
/* This array tracks active taps. */
struct tap_queue __rcu *taps[MAX_TAP_QUEUES]; //receive queues, one per open file
/* This list tracks all taps (both enabled and disabled) */
struct list_head queue_list;
int numvtaps;
int numqueues;
/* Offload features OR-ed into the GSO feature mask on receive when IFF_VNET_HDR is set. */
netdev_features_t tap_features;
/* Character-device minor; 0 means none is allocated (see NETDEV_UNREGISTER handling). */
int minor;
/* Driver-provided hooks; the drop counters are called from the rx/tx error paths. */
void (*update_features)(struct tap_dev *tap, netdev_features_t features);
void (*count_tx_dropped)(struct tap_dev *tap);
void (*count_rx_dropped)(struct tap_dev *tap);
};
/*
 * One queue per open() of the tap character device. It carries the socket
 * state plus the ring of packets waiting to be read by user space.
 */
struct tap_queue {
/* Must stay the first member: tap_open() casts the sk_alloc() result to struct tap_queue *. */
struct sock sk;
struct socket sock;
struct socket_wq wq;
/* Size of the virtio-net header on this queue; initialised to sizeof(struct virtio_net_hdr). */
int vnet_hdr_sz;
struct tap_dev __rcu *tap;
struct file *file;
/* IFF_* flags; tap_open() starts with IFF_VNET_HDR | IFF_NO_PI | IFF_TAP. */
unsigned int flags;
u16 queue_index;
bool enabled;
struct list_head next;
struct skb_array skb_array; //packet buffer filled by tap_handle_frame()
};
模块注册
/*
 * Module initialisation.
 *
 * Registers, in order: the character-device major/minor range, the sysfs
 * class, the netdevice notifier and the rtnl link ops used by "ip link" to
 * create macvtap devices.  On failure, everything registered so far is torn
 * down in reverse order and the error code is returned.
 */
static int macvtap_init(void)
{
	int ret;

	/* Reserve the major/minor numbers used by the macvtap char devices. */
	ret = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap");
	if (ret)
		return ret;

	/* Expose the macvtap class in sysfs. */
	ret = class_register(&macvtap_class);
	if (ret)
		goto err_cdev;

	/* Get called back on net device events (register/unregister/...). */
	ret = register_netdevice_notifier(&macvtap_notifier_block);
	if (ret)
		goto err_class;

	/* Register the netlink link ops so user space can create macvtap links. */
	ret = macvlan_link_register(&macvtap_link_ops);
	if (ret)
		goto err_notifier;

	return 0;

err_notifier:
	unregister_netdevice_notifier(&macvtap_notifier_block);
err_class:
	class_unregister(&macvtap_class);
err_cdev:
	tap_destroy_cdev(macvtap_major, &macvtap_cdev);
	return ret;
}
macvtap设备创建
/*
 * Initialise a new macvtap net_device: apply the common macvlan setup, then
 * override the TX queue length with TUN_READQ_SIZE.  tap_open() later uses
 * dev->tx_queue_len as the size of each queue's skb_array.
 */
static void macvtap_setup(struct net_device *dev)
{
macvlan_common_setup(dev); //run the common macvlan net device initialisation
dev->tx_queue_len = TUN_READQ_SIZE;
}
static int macvtap_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
struct macvtap_dev *vlantap = netdev_priv(dev);
int err;
INIT_LIST_HEAD(&vlantap->tap.queue_list);
/* Since macvlan supports all offloads by default, make
* tap support all offloads also.
*/
vlantap->tap.tap_features = TUN_OFFLOADS;
/* Register callbacks for rx/tx drops accounting and updating
* net_device features
*/
vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
vlantap->tap.update_features = macvtap_update_features;
err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);//macvtap设备rx_handler函数
if (err)
return err;
/* Don't put anything that may fail after macvlan_common_newlink
* because we can't undo what it does.
*/
err = macvlan_common_newlink(src_net, dev, tb, data);//调用macvlan注册新网卡设备
if (err) {
netdev_rx_handler_unregister(dev);
return err;
}
vlantap->tap.dev = vlantap->vlan.dev;
return 0;
}
网卡注册时会触发 网卡注册事件,最后回调macvtap_device_event函数。
/*
 * Netdevice notifier callback for macvtap devices.
 *
 * Events for devices that were not created through macvtap_link_ops are
 * ignored.  For macvtap devices:
 *   NETDEV_REGISTER            - allocate a minor, create the "tap<ifindex>"
 *                                character device node and a sysfs link to it.
 *   NETDEV_UNREGISTER          - remove the link, the device node and the minor
 *                                (skipped when no minor was allocated).
 *   NETDEV_CHANGE_TX_QUEUE_LEN - resize the per-queue packet rings.
 *
 * Returns NOTIFY_DONE, NOTIFY_BAD, or an errno wrapped by
 * notifier_from_errno() when a step fails.
 */
static int macvtap_device_event(struct notifier_block *unused,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct macvtap_dev *vlantap;
	struct device *classdev;
	dev_t devt;
	int err;
	char tap_name[IFNAMSIZ];

	/* Not a macvtap device: nothing to do. */
	if (dev->rtnl_link_ops != &macvtap_link_ops)
		return NOTIFY_DONE;

	snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
	vlantap = netdev_priv(dev);

	switch (event) {
	case NETDEV_REGISTER:
		/* Create the device node here after the network device has
		 * been registered but before register_netdevice has
		 * finished running.
		 */
		/* Allocate the minor and associate it with this tap device. */
		err = tap_get_minor(macvtap_major, &vlantap->tap);
		if (err)
			return notifier_from_errno(err);

		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
		/*
		 * device_create() takes a printf-style format; pass the name
		 * as an argument instead of using it as the format string.
		 */
		classdev = device_create(&macvtap_class, &dev->dev, devt,
					 dev, "%s", tap_name);
		if (IS_ERR(classdev)) {
			tap_free_minor(macvtap_major, &vlantap->tap);
			return notifier_from_errno(PTR_ERR(classdev));
		}
		err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
					tap_name);
		if (err)
			return notifier_from_errno(err);
		break;
	case NETDEV_UNREGISTER:
		/* vlan->minor == 0 if NETDEV_REGISTER above failed */
		if (vlantap->tap.minor == 0)
			break;
		sysfs_remove_link(&dev->dev.kobj, tap_name);
		devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
		device_destroy(&macvtap_class, devt);
		tap_free_minor(macvtap_major, &vlantap->tap);
		break;
	case NETDEV_CHANGE_TX_QUEUE_LEN:
		if (tap_queue_resize(&vlantap->tap))
			return NOTIFY_BAD;
		break;
	default:
		break;
	}

	return NOTIFY_DONE;
}
打开字符设备,用于用户态通讯
static int tap_open(struct inode *inode, struct file *file)
{
struct net *net = current->nsproxy->net_ns;
struct tap_dev *tap;
struct tap_queue *q;
int err = -ENODEV;
rtnl_lock();
tap = dev_get_by_tap_file(imajor(inode), iminor(inode)); //通过major和minor号获取对应的tap
if (!tap)
goto err;
err = -ENOMEM;
q = (struct tap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
&tap_proto, 0); //创建tap_queue每次打开字符设备多一个队列
if (!q)
goto err;
RCU_INIT_POINTER(q->sock.wq, &q->wq);
init_waitqueue_head(&q->wq.wait);
q->sock.type = SOCK_RAW;
q->sock.state = SS_CONNECTED;
q->sock.file = file;
q->sock.ops = &tap_socket_ops;
sock_init_data(&q->sock, &q->sk);
q->sk.sk_write_space = tap_sock_write_space;
q->sk.sk_destruct = tap_sock_destruct;
q->flags = IFF_VNET_HDR | IFF_NO_PI | IFF_TAP;
q->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
/*
* so far only KVM virtio_net uses tap, enable zero copy between
* guest kernel and host kernel when lower device supports zerocopy
*
* The macvlan supports zerocopy iff the lower device supports zero
* copy so we don't have to look at the lower device directly.
*/
if ((tap->dev->features & NETIF_F_HIGHDMA) && (tap->dev->features & NETIF_F_SG))
sock_set_flag(&q->sk, SOCK_ZEROCOPY);
err = -ENOMEM;
if (skb_array_init(&q->skb_array, tap->dev->tx_queue_len, GFP_KERNEL))
goto err_array;
err = tap_set_queue(tap, file, q); //将队列和tap设备关联。
if (err)
goto err_queue;
dev_put(tap->dev);
rtnl_unlock();
return err;
err_queue:
skb_array_cleanup(&q->skb_array);
err_array:
sock_put(&q->sk);
err:
if (tap)
dev_put(tap->dev);
rtnl_unlock();
return err;
}
至此,数据通路已经完全打开。
接收数据流程
在netif_receive_skb函数中,通过设备上注册的rx_handler调用 tap_handle_frame函数
/*
 * rx_handler for tap-backed devices: moves a received frame into the
 * skb_array of one of the device's queues and wakes up readers.
 *
 * Returns RX_HANDLER_PASS when the device has no tap state or no queue is
 * available, otherwise RX_HANDLER_CONSUMED (the frame was queued or dropped).
 * Frames taking the drop label are counted through tap->count_rx_dropped.
 */
rx_handler_result_t tap_handle_frame(struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
struct net_device *dev = skb->dev;
struct tap_dev *tap;
struct tap_queue *q;
netdev_features_t features = TAP_FEATURES;
tap = tap_dev_get_rcu(dev);//look up the tap state associated with the device
if (!tap)
return RX_HANDLER_PASS;
q = tap_get_queue(tap, skb);//pick the destination queue (hash based, per the article)
if (!q)
return RX_HANDLER_PASS;
/* Drop early when the ring has no room left. */
if (__skb_array_full(&q->skb_array))
goto drop;
/* Make the Ethernet header part of the data handed to the reader. */
skb_push(skb, ETH_HLEN);
/* Apply the forward feature mask so that we perform segmentation
 * according to users wishes. This only works if VNET_HDR is
 * enabled.
 */
if (q->flags & IFF_VNET_HDR)
features |= tap->tap_features;
if (netif_needs_gso(skb, features)) {
struct sk_buff *segs = __skb_gso_segment(skb, features, false);
if (IS_ERR(segs))
goto drop;
/* No segment list returned: queue the original skb as is. */
if (!segs) {
if (skb_array_produce(&q->skb_array, skb)) //put the packet into the buffer ring
goto drop;
goto wake_up;
}
/* The original skb is replaced by its segments. */
consume_skb(skb);
/*
 * Queue the segments one by one. If the ring fills up, free the
 * current and all remaining segments and stop; readers are still
 * woken up and this path does not call count_rx_dropped.
 */
while (segs) {
struct sk_buff *nskb = segs->next;
segs->next = NULL;
if (skb_array_produce(&q->skb_array, segs)) {
kfree_skb(segs);
kfree_skb_list(nskb);
break;
}
segs = nskb;
}
} else {
/* If we receive a partial checksum and the tap side
 * doesn't support checksum offload, compute the checksum.
 * Note: it doesn't matter which checksum feature to
 * check, we either support them all or none.
 */
if (skb->ip_summed == CHECKSUM_PARTIAL &&
!(features & NETIF_F_CSUM_MASK) &&
skb_checksum_help(skb))
goto drop;
if (skb_array_produce(&q->skb_array, skb)) //put the packet into the buffer ring
goto drop;
}
wake_up:
wake_up_interruptible_poll(sk_sleep(&q->sk), POLLIN | POLLRDNORM | POLLRDBAND); //wake up processes waiting to read
return RX_HANDLER_CONSUMED;
drop:
/* Count errors/drops only here, thus don't care about args. */
if (tap->count_rx_dropped)
tap->count_rx_dropped(tap);
kfree_skb(skb);
return RX_HANDLER_CONSUMED;
}
用户态调用tap_read_iter读取数据,tap_do_read 函数跟 tun/tap中函数类似,都是判断缓冲区是否有数据,如果没有数据就等待,如果有数据返回数据
/*
 * read_iter handler of the tap character device.
 *
 * Fetches data from the queue attached to the file via tap_do_read(),
 * honouring O_NONBLOCK.  The result is capped at the size that was
 * requested, and a positive result is also stored in iocb->ki_pos.
 *
 * Returns the number of bytes read, or the (non-positive) result of
 * tap_do_read().
 */
static ssize_t tap_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct tap_queue *q = file->private_data;
	/* Sampled before the read, which is handed the same iterator. */
	ssize_t wanted = iov_iter_count(to);
	ssize_t copied;

	/* Pull data out of the queue's buffer. */
	copied = tap_do_read(q, to, file->f_flags & O_NONBLOCK, NULL);
	if (copied > wanted)
		copied = wanted;

	if (copied > 0)
		iocb->ki_pos = copied;

	return copied;
}
发送数据流程
/*
 * write_iter handler of the tap character device.
 *
 * Hands the user data to tap_get_user() for the queue attached to the file,
 * with no msghdr and honouring O_NONBLOCK.  Returns whatever tap_get_user()
 * returns.
 */
static ssize_t tap_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct tap_queue *q = file->private_data;

	/* Send the data. */
	return tap_get_user(q, NULL, from, file->f_flags & O_NONBLOCK);
}
/*
 * Build an skb from user data and transmit it on the tap's network device.
 *
 * @q:       queue the data was written to
 * @m:       optional msghdr; when it carries msg_control and the socket has
 *           SOCK_ZEROCOPY set, a zerocopy transmit is attempted
 * @from:    iterator over the user data (optional virtio-net header first
 *           when IFF_VNET_HDR is set on the queue)
 * @noblock: passed through to the skb allocation
 *
 * Returns the total number of bytes in @from on success (also when the
 * queue is no longer attached to a tap and the skb is freed), or a negative
 * errno.  On error the tap's count_tx_dropped hook is invoked if present.
 */
static ssize_t tap_get_user(struct tap_queue *q, struct msghdr *m,
struct iov_iter *from, int noblock)
{
int good_linear = SKB_MAX_HEAD(TAP_RESERVE);
struct sk_buff *skb;
struct tap_dev *tap;
unsigned long total_len = iov_iter_count(from);
unsigned long len = total_len;
int err;
struct virtio_net_hdr vnet_hdr = { 0 };
int vnet_hdr_len = 0;
int copylen = 0;
int depth;
bool zerocopy = false;
size_t linear;
/* Strip and validate the virtio-net header, if this queue uses one. */
if (q->flags & IFF_VNET_HDR) {
vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
err = -EINVAL;
if (len < vnet_hdr_len)
goto err;
len -= vnet_hdr_len;
err = -EFAULT;
if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), from))
goto err;
/* Skip any header bytes beyond the basic struct virtio_net_hdr. */
iov_iter_advance(from, vnet_hdr_len - sizeof(vnet_hdr));
/* Ensure hdr_len covers the checksum field when a checksum is requested. */
if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
tap16_to_cpu(q, vnet_hdr.csum_start) +
tap16_to_cpu(q, vnet_hdr.csum_offset) + 2 >
tap16_to_cpu(q, vnet_hdr.hdr_len))
vnet_hdr.hdr_len = cpu_to_tap16(q,
tap16_to_cpu(q, vnet_hdr.csum_start) +
tap16_to_cpu(q, vnet_hdr.csum_offset) + 2);
err = -EINVAL;
if (tap16_to_cpu(q, vnet_hdr.hdr_len) > len)
goto err;
}
/* The payload must hold at least an Ethernet header. */
err = -EINVAL;
if (unlikely(len < ETH_HLEN))
goto err;
/*
 * Zerocopy attempt: copy only the header part (clamped to
 * [ETH_HLEN, good_linear]) and check that the remainder fits in
 * MAX_SKB_FRAGS pages.
 */
if (m && m->msg_control && sock_flag(&q->sk, SOCK_ZEROCOPY)) {
struct iov_iter i;
copylen = vnet_hdr.hdr_len ?
tap16_to_cpu(q, vnet_hdr.hdr_len) : GOODCOPY_LEN;
if (copylen > good_linear)
copylen = good_linear;
else if (copylen < ETH_HLEN)
copylen = ETH_HLEN;
linear = copylen;
i = *from;
iov_iter_advance(&i, copylen);
if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
zerocopy = true;
}
/* Regular path: copy everything; the linear part is clamped the same way. */
if (!zerocopy) {
copylen = len;
linear = tap16_to_cpu(q, vnet_hdr.hdr_len);
if (linear > good_linear)
linear = good_linear;
else if (linear < ETH_HLEN)
linear = ETH_HLEN;
}
skb = tap_alloc_skb(&q->sk, TAP_RESERVE, copylen,
linear, noblock, &err);//allocate the skb
if (!skb)
goto err;
if (zerocopy)
err = zerocopy_sg_from_iter(skb, from);
else
err = skb_copy_datagram_from_iter(skb, 0, from, len); //copy the payload from the iterator
if (err)
goto err_kfree;
skb_set_network_header(skb, ETH_HLEN);
skb_reset_mac_header(skb); //set the MAC header
skb->protocol = eth_hdr(skb)->h_proto;
if (vnet_hdr_len) {
err = virtio_net_hdr_to_skb(skb, &vnet_hdr,
tap_is_little_endian(q));
if (err)
goto err_kfree;
}
skb_probe_transport_header(skb, ETH_HLEN);
/* Move network header to the right position for VLAN tagged packets */
if ((skb->protocol == htons(ETH_P_8021Q) ||
skb->protocol == htons(ETH_P_8021AD)) &&
__vlan_get_protocol(skb, skb->protocol, &depth) != 0)
skb_set_network_header(skb, depth);
rcu_read_lock();
tap = rcu_dereference(q->tap);
/* copy skb_ubuf_info for callback when skb has no error */
if (zerocopy) {
skb_shinfo(skb)->destructor_arg = m->msg_control;
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
} else if (m && m->msg_control) {
/* Data was copied after all: invoke the caller's ubuf callback right away. */
struct ubuf_info *uarg = m->msg_control;
uarg->callback(uarg, false);
}
if (tap) {
skb->dev = tap->dev; //select the net device used for transmission
dev_queue_xmit(skb); //transmit; per the article this ends up in the macvlan xmit path
} else {
/* Queue is no longer attached to a tap: discard the packet. */
kfree_skb(skb);
}
rcu_read_unlock();
return total_len;
err_kfree:
kfree_skb(skb);
err:
/* Account the failed transmit as a TX drop. */
rcu_read_lock();
tap = rcu_dereference(q->tap);
if (tap && tap->count_tx_dropped)
tap->count_tx_dropped(tap);
rcu_read_unlock();
return err;
}
至此, macvtap数据发送流程结束。
macvtap在vpn和 虚拟化中应用很广。它主要功能是将macvlan设备获取的数据包直接上送到用户态而不是协议栈。