ipvlan虚拟网卡类似于macvlan虚拟网卡。区别在于macvlan的每个虚拟网卡都有自己的mac地址,而ipvlan的所有虚拟网卡共用一个mac地址,通过不同的ip地址来区分数据包属于哪个网卡,有点类似于内核之前的别名接口。但是它跟别名接口最大的区别在于:由于存在虚拟网卡设备,可以将不同的虚拟网卡分配到不同的命名空间。后续我们可以看到它还能够跟tap设备结合,组成ipvtap虚拟网卡。
类似于macvlan设备,ipvlan设备也有3种模式:
L2模式,在这种模式下,ipvlan虚拟网卡能够收到广播报文,能够自己处理arp请求
L3模式,在这种模式下,ipvlan虚拟网卡不能够接收二层广播报文,arp请求由主网卡代为处理
L3S模式,在这种模式下,ipvlan虚拟网卡不能够接收二层广播报文,arp请求由主网卡代为处理。跟L3模式唯一的区别在于,L3S模式只有在报文是发往本地的时候才修改接收数据包的网卡,否则不修改,这种模式没法和tap设备结合
ipvlan设备的关键数据结构是ipvl_port结构,每个被寄生的物理设备都会有一个这样的结构,所有寄生在它上面的ipvlan设备都会被连接到这个结构中
/*
 * Per-lower-device state. One ipvl_port exists for each physical device
 * that has ipvlan devices stacked on it, and every such ipvlan device is
 * linked into it.
 */
struct ipvl_port {
struct net_device *dev; // lower (physical) device the ipvlan devices are stacked on
possible_net_t pnet; // network namespace
struct hlist_head hlhead[IPVLAN_HASH_SIZE]; // hash table used to find the owning ipvlan device by address
struct list_head ipvlans; // all ipvlan devices on this port (linked through ipvl_dev.pnode)
// operating mode (L2 / L3 / L3S); it is stored per port, so it is shared by every device on the port
u16 mode;
// lower bound used for the next dev_id allocation in ipvlan_link_new()
u16 dev_id_start;
struct work_struct wq; // work item that processes queued multicast/broadcast frames
struct sk_buff_head backlog; // multicast/broadcast frames waiting for that work item
int count; // number of ipvlan devices stacked on this port
// allocator for the 16-bit dev_id values handed to the ipvlan devices
struct ida ida;
};
每个ipvlan设备私有数据结构
/*
 * Private data of one ipvlan net_device (what netdev_priv() returns for it).
 */
struct ipvl_dev {
struct net_device *dev; // the ipvlan net_device itself
struct list_head pnode; // list node linking this device into port->ipvlans
struct ipvl_port *port; // port this device belongs to
struct net_device *phy_dev; // lower (physical) device this ipvlan device is stacked on
struct list_head addrs; // IP addresses configured on this ipvlan device
struct ipvl_pcpu_stats __percpu *pcpu_stats; // per-CPU statistics counters (tx_pkts, tx_bytes, tx_drps, ...)
DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
// initialised to IPVLAN_FEATURES when the link is created
netdev_features_t sfeatures;
u32 msg_enable;
};
模块注册
/*
 * Module entry point.
 *
 * Installs the notifiers the driver relies on (net_device events plus
 * IPv4/IPv6 address change and address validation events), then registers
 * the per-namespace operations and finally the link operations.
 *
 * Returns 0 on success. If either of the last two registrations fails,
 * everything installed before it is removed again and the negative error
 * code is returned.
 */
static int __init ipvlan_init_module(void)
{
	int rc;

	ipvlan_init_secret();

	/* Device events. */
	register_netdevice_notifier(&ipvlan_notifier_block);

	/* IPv6 address add/remove and validation events. */
	register_inet6addr_notifier(&ipvlan_addr6_notifier_block);
	register_inet6addr_validator_notifier(&ipvlan_addr6_vtor_notifier_block);

	/* IPv4 address add/remove and validation events. */
	register_inetaddr_notifier(&ipvlan_addr4_notifier_block);
	register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block);

	/* Per network namespace init/exit hooks. */
	rc = register_pernet_subsys(&ipvlan_net_ops);
	if (rc < 0)
		goto drop_notifiers;

	/* Link operations used to create ipvlan devices. */
	rc = ipvlan_link_register(&ipvlan_link_ops);
	if (rc < 0)
		goto drop_pernet;

	return 0;

drop_pernet:
	unregister_pernet_subsys(&ipvlan_net_ops);
drop_notifiers:
	unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
	unregister_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block);
	unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
	unregister_inet6addr_validator_notifier(&ipvlan_addr6_vtor_notifier_block);
	unregister_netdevice_notifier(&ipvlan_notifier_block);
	return rc;
}
由于ipvlan需要根据ip地址分流,因此当虚拟网卡添加或者删除ip时,都需要在分流hash表中做相应的操作。因此需要注册 ip地址变化事件处理函数
/*
 * ipvlan_link_new - set up and register a new ipvlan device.
 * @src_net: namespace in which the IFLA_LINK ifindex is looked up
 * @dev:     the ipvlan net_device being created; its private area is a
 *           struct ipvl_dev
 * @tb:      link attributes; IFLA_LINK (ifindex of the lower device) is
 *           mandatory
 * @data:    ipvlan attributes; IFLA_IPVLAN_MODE is optional, the mode
 *           defaults to IPVLAN_MODE_L3
 * @extack:  not referenced by this function
 *
 * Returns 0 on success, -EINVAL when IFLA_LINK is absent, -ENODEV when the
 * lower device cannot be found, or the negative error code of whichever
 * helper failed. On failure the steps already taken are undone in reverse
 * order through the labels at the end of the function.
 */
int ipvlan_link_new(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
struct ipvl_dev *ipvlan = netdev_priv(dev);
struct ipvl_port *port;
struct net_device *phy_dev;
int err;
u16 mode = IPVLAN_MODE_L3;
bool create = false;
if (!tb[IFLA_LINK])
return -EINVAL;
phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); // look up the lower (physical) device
if (!phy_dev)
return -ENODEV;
if (netif_is_ipvlan(phy_dev)) { // stacking on an ipvlan device: use that device's own lower device instead
struct ipvl_dev *tmp = netdev_priv(phy_dev);
phy_dev = tmp->phy_dev;
} else if (!netif_is_ipvlan_port(phy_dev)) {
// The lower device has no ipvlan port yet: create one and remember
// that this call did so, so the error path can destroy it again.
err = ipvlan_port_create(phy_dev);
if (err < 0)
return err;
create = true;
}
if (data && data[IFLA_IPVLAN_MODE])
mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); // mode requested for the new device
port = ipvlan_port_get_rtnl(phy_dev);
ipvlan->phy_dev = phy_dev;
ipvlan->dev = dev;
ipvlan->port = port;
ipvlan->sfeatures = IPVLAN_FEATURES;
ipvlan_adjust_mtu(ipvlan, phy_dev);
INIT_LIST_HEAD(&ipvlan->addrs);
/* If the port-id base is at the MAX value, then wrap it around and
 * begin from 0x1 again. This may be due to a busy system where lots
 * of slaves are getting created and deleted.
 */
if (port->dev_id_start == 0xFFFE)
port->dev_id_start = 0x1;
/* Since L2 address is shared among all IPvlan slaves including
 * master, use unique 16 bit dev-ids to differentiate among them.
 * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
 * slave link [see addrconf_ifid_eui48()].
 */
// First try the range starting at dev_id_start; if that is exhausted,
// retry with the range below dev_id_start.
err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE,
GFP_KERNEL);
if (err < 0)
err = ida_simple_get(&port->ida, 0x1, port->dev_id_start,
GFP_KERNEL);
if (err < 0)
goto destroy_ipvlan_port;
dev->dev_id = err;
/* Increment id-base to the next slot for the future assignment */
port->dev_id_start = err + 1;
/* TODO Probably put random address here to be presented to the
 * world but keep using the physical-dev address for the outgoing
 * packets.
 */
memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN); // the ipvlan device uses the lower device's MAC address
dev->priv_flags |= IFF_IPVLAN_SLAVE;
err = register_netdevice(dev);
if (err < 0)
goto remove_ida;
err = netdev_upper_dev_link(phy_dev, dev);
if (err) {
goto unregister_netdev;
}
err = ipvlan_set_port_mode(port, mode); // set the port mode; the mode is per port, so all ipvlan devices on one lower device run in the same mode
if (err) {
goto unlink_netdev;
}
// Make the new device reachable from its port.
list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
netif_stacked_transfer_operstate(phy_dev, dev);
return 0;
unlink_netdev:
netdev_upper_dev_unlink(phy_dev, dev);
unregister_netdev:
unregister_netdevice(dev);
// NOTE(review): port is still dereferenced below after
// unregister_netdevice(dev); confirm that unregistering the device cannot
// already have released the port.
remove_ida:
ida_simple_remove(&port->ida, dev->dev_id);
destroy_ipvlan_port:
// Only tear the port down if this call created it.
if (create)
ipvlan_port_destroy(phy_dev);
return err;
}
打开设备 函数
static int ipvlan_open(struct net_device *dev)
{
struct ipvl_dev *ipvlan = netdev_priv(dev);
struct net_device *phy_dev = ipvlan->phy_dev;
struct ipvl_addr *addr;
if (ipvlan->port->mode == IPVLAN_MODE_L3 ||
ipvlan->port->mode == IPVLAN_MODE_L3S)
dev->flags |= IFF_NOARP;
else
dev->flags &= ~IFF_NOARP; //只L2模式下 需要回复 arp报文
list_for_each_entry(addr, &ipvlan->addrs, anode)
ipvlan_ht_addr_add(ipvlan, addr); //将这个设备关联的ip地址添加到查找hash表中。
return dev_uc_add(phy_dev, phy_dev->dev_addr);
}
数据流 接收数据包
rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev);
if (!port)
return RX_HANDLER_PASS;
switch (port->mode) {
case IPVLAN_MODE_L2:
return ipvlan_handle_mode_l2(pskb, port); //L2 模式收包函数
case IPVLAN_MODE_L3:
return ipvlan_handle_mode_l3(pskb, port); //L3模式收包函数
case IPVLAN_MODE_L3S:
return RX_HANDLER_PASS; //L3s模式,在走协议栈时不改变dev,只有在最后local_in链表中改变
}
/* Should not reach here */
WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
port->mode);
kfree_skb(skb);
return RX_HANDLER_CONSUMED;
}
/*
 * Receive path for a port in L2 mode.
 *
 * Multicast/broadcast frames that ipvlan_external_frame() reports as
 * external are cloned and the clone is queued on the port for later
 * distribution, while the original is passed on (RX_HANDLER_PASS).
 * Unicast frames are matched against the port's address table; on a hit
 * the frame is handed to ipvlan_rcv_frame() for the owning ipvlan device
 * and that call's result is returned. Everything else is passed on
 * unchanged.
 */
static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
						 struct ipvl_port *port)
{
	struct sk_buff *skb = *pskb;
	struct ethhdr *eth = eth_hdr(skb);
	struct ipvl_addr *owner;
	void *l3;
	int addr_type;

	if (is_multicast_ether_addr(eth->h_dest)) {
		struct sk_buff *copy;

		if (!ipvlan_external_frame(skb, port))
			return RX_HANDLER_PASS;

		/* External frames are queued for device local
		 * distribution, but a copy is given to master
		 * straight away to avoid sending duplicates later
		 * when work-queue processes this frame. This is
		 * achieved by returning RX_HANDLER_PASS.
		 */
		copy = skb_clone(skb, GFP_ATOMIC);
		if (copy) {
			ipvlan_skb_crossing_ns(copy, NULL);
			/* Hand the clone to the port's multicast queue. */
			ipvlan_multicast_enqueue(port, copy, false);
		}
		return RX_HANDLER_PASS;
	}

	l3 = ipvlan_get_L3_hdr(skb, &addr_type);
	if (!l3)
		return RX_HANDLER_PASS;

	/* Find the ipvlan device that owns the destination address. */
	owner = ipvlan_addr_lookup(port, l3, addr_type, true);
	if (!owner)
		return RX_HANDLER_PASS;

	return ipvlan_rcv_frame(owner, pskb, false);
}
/*
 * Receive path for a port in L3 mode.
 *
 * Looks up the ipvlan device owning the frame's destination address. On a
 * hit the frame is handed to ipvlan_rcv_frame() for that device and its
 * result is returned. If the L3 header cannot be obtained or no device
 * matches, the frame is passed on unchanged (RX_HANDLER_PASS).
 */
static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb,
						 struct ipvl_port *port)
{
	struct sk_buff *skb = *pskb;
	struct ipvl_addr *owner;
	void *l3;
	int addr_type;

	l3 = ipvlan_get_L3_hdr(skb, &addr_type);
	if (!l3)
		return RX_HANDLER_PASS;

	owner = ipvlan_addr_lookup(port, l3, addr_type, true);
	if (!owner)
		return RX_HANDLER_PASS;

	return ipvlan_rcv_frame(owner, pskb, false);
}
数据流 发包流程
/*
 * Transmit entry point of an ipvlan device.
 *
 * Hands the skb to ipvlan_queue_xmit() and updates this CPU's counters:
 * tx_pkts/tx_bytes when the result is NET_XMIT_SUCCESS or NET_XMIT_CN,
 * tx_drps otherwise. Returns ipvlan_queue_xmit()'s result unchanged.
 */
static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb,
				     struct net_device *dev)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	/* Sample the length first: ipvlan_queue_xmit() takes over the skb
	 * and frees it on its drop path.
	 */
	int bytes = skb->len;
	struct ipvl_pcpu_stats *stats;
	int rc;

	rc = ipvlan_queue_xmit(skb, dev);
	if (unlikely(rc != NET_XMIT_SUCCESS && rc != NET_XMIT_CN)) {
		this_cpu_inc(ipvlan->pcpu_stats->tx_drps);
		return rc;
	}

	stats = this_cpu_ptr(ipvlan->pcpu_stats);
	u64_stats_update_begin(&stats->syncp);
	stats->tx_pkts++;
	stats->tx_bytes += bytes;
	u64_stats_update_end(&stats->syncp);

	return rc;
}
/*
 * Mode-dependent transmit dispatcher.
 *
 * Requires the lower device to have a port and the skb to contain a full
 * Ethernet header, then forwards to ipvlan_xmit_mode_l2() for L2 ports or
 * ipvlan_xmit_mode_l3() for L3 and L3S ports and returns their result.
 *
 * If either precondition fails, or the mode is unknown (which also
 * triggers a one-time warning), the skb is freed and NET_XMIT_DROP is
 * returned.
 */
int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_port *port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev);

	if (!port || unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
		goto drop;

	switch (port->mode) {
	case IPVLAN_MODE_L2:
		return ipvlan_xmit_mode_l2(skb, dev);
	case IPVLAN_MODE_L3:
	case IPVLAN_MODE_L3S:
		return ipvlan_xmit_mode_l3(skb, dev);
	default:
		/* Should not reach here */
		WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n",
			  port->mode);
		break;
	}

drop:
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
/*
 * Transmit path for a port in L2 mode.
 *
 * - Destination MAC equals source MAC (traffic for the shared MAC): if the
 *   destination IP belongs to an ipvlan device on this port, the frame is
 *   delivered to it through ipvlan_rcv_frame(); otherwise it is forwarded
 *   to the lower device with dev_forward_skb().
 * - Multicast/broadcast destination: the frame is queued on the port's
 *   multicast queue and NET_XMIT_SUCCESS is returned.
 * - Anything else is sent out through the lower device with
 *   dev_queue_xmit().
 *
 * Returns the result of the helper that consumed the skb, or
 * NET_XMIT_DROP if skb_share_check() fails.
 */
static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	/*
	 * On the transmit path the Ethernet header sits at skb->data (the
	 * caller, ipvlan_queue_xmit(), has already made sure a full ethhdr
	 * is pullable there), whereas skb->mac_header is not guaranteed to
	 * be set. eth_hdr() depends on mac_header and could therefore read
	 * out of bounds; skb_eth_hdr() reads from skb->data instead.
	 */
	struct ethhdr *eth = skb_eth_hdr(skb);
	struct ipvl_addr *addr;
	void *lyr3h;
	int addr_type;

	if (ether_addr_equal(eth->h_dest, eth->h_source)) {
		/* Frame addressed to the MAC shared by this port. */
		lyr3h = ipvlan_get_L3_hdr(skb, &addr_type);
		if (lyr3h) {
			addr = ipvlan_addr_lookup(ipvlan->port, lyr3h,
						  addr_type, true);
			if (addr) {
				/* Deliver straight to the owning ipvlan device. */
				return ipvlan_rcv_frame(addr, &skb, true);
			}
		}

		skb = skb_share_check(skb, GFP_ATOMIC);
		if (!skb)
			return NET_XMIT_DROP;

		/* Packet definitely does not belong to any of the
		 * virtual devices, but the dest is local. So forward
		 * the skb for the main-dev. At the RX side we just return
		 * RX_PASS for it to be processed further on the stack.
		 */
		return dev_forward_skb(ipvlan->phy_dev, skb);
	}

	if (is_multicast_ether_addr(eth->h_dest)) {
		/* Multicast/broadcast: defer to the port's multicast queue. */
		ipvlan_skb_crossing_ns(skb, NULL);
		ipvlan_multicast_enqueue(ipvlan->port, skb, true);
		return NET_XMIT_SUCCESS;
	}

	/* Ordinary unicast: send it out through the lower device. */
	ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
	return dev_queue_xmit(skb);
}
/*
 * Transmit path for a port in L3 or L3S mode.
 *
 * If the destination address belongs to an ipvlan device on the same
 * port, the packet is delivered to that device through ipvlan_rcv_frame().
 * Otherwise, including when the L3 header cannot be obtained, the skb is
 * moved to the lower device with ipvlan_skb_crossing_ns() and sent through
 * ipvlan_process_outbound(), which routes it.
 */
static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_addr *owner;
	void *l3;
	int addr_type;

	l3 = ipvlan_get_L3_hdr(skb, &addr_type);
	if (l3) {
		owner = ipvlan_addr_lookup(ipvlan->port, l3, addr_type, true);
		if (owner)
			return ipvlan_rcv_frame(owner, &skb, true);
	}

	ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
	return ipvlan_process_outbound(skb);
}
/*
 * Send a packet out through the IP output path (L3/L3S transmit).
 *
 * When the skb carries a MAC header, multicast/broadcast destinations are
 * dropped and otherwise the Ethernet header is stripped before the packet
 * is handed to the IPv4 or IPv6 outbound helper according to
 * skb->protocol. Packets of any other protocol are dropped.
 *
 * Returns the helper's result, or NET_XMIT_DROP for dropped packets. The
 * skb is always consumed.
 */
static int ipvlan_process_outbound(struct sk_buff *skb)
{
	int ret = NET_XMIT_DROP;

	/* The ipvlan is a pseudo-L2 device, so the packets that we receive
	 * will have L2; which need to discarded and processed further
	 * in the net-ns of the main-device.
	 */
	if (skb_mac_header_was_set(skb)) {
		/* Only look at the Ethernet header once we know mac_header
		 * is valid; reading it unconditionally could access memory
		 * outside the packet when the header was never set.
		 */
		struct ethhdr *ethh = eth_hdr(skb);

		/* In this mode we dont care about multicast and broadcast
		 * traffic
		 */
		if (is_multicast_ether_addr(ethh->h_dest)) {
			pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n",
					    ntohs(skb->protocol));
			kfree_skb(skb);
			goto out;
		}

		skb_pull(skb, sizeof(*ethh));
		skb->mac_header = (typeof(skb->mac_header))~0U;
		skb_reset_network_header(skb);
	}

	if (skb->protocol == htons(ETH_P_IPV6))
		ret = ipvlan_process_v6_outbound(skb);
	else if (skb->protocol == htons(ETH_P_IP))
		ret = ipvlan_process_v4_outbound(skb);
	else {
		pr_warn_ratelimited("Dropped outbound packet type=%x\n",
				    ntohs(skb->protocol));
		kfree_skb(skb);
	}
out:
	return ret;
}
/*
 * Route an IPv4 packet and send it with ip_local_out().
 *
 * The route lookup uses the packet's own source/destination addresses and
 * TOS, with the output interface set to skb->dev. Only unicast and local
 * routes are accepted.
 *
 * Returns NET_XMIT_SUCCESS when ip_local_out() reports success. Otherwise
 * it returns NET_XMIT_DROP and bumps the device's tx_errors counter; the
 * skb is freed here when routing fails or yields an unusable route type.
 */
static int ipvlan_process_v4_outbound(struct sk_buff *skb)
{
	const struct iphdr *ip4h = ip_hdr(skb);
	/* Captured up front: the skb must not be touched after ip_local_out(). */
	struct net_device *dev = skb->dev;
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int err;
	struct flowi4 fl4 = {
		.flowi4_oif = dev->ifindex,
		.flowi4_tos = RT_TOS(ip4h->tos),
		.flowi4_flags = FLOWI_FLAG_ANYSRC,
		.daddr = ip4h->daddr,
		.saddr = ip4h->saddr,
	};

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		goto drop;

	if (!(rt->rt_type == RTN_UNICAST || rt->rt_type == RTN_LOCAL)) {
		ip_rt_put(rt);
		goto drop;
	}

	skb_dst_set(skb, &rt->dst);
	err = ip_local_out(net, skb->sk, skb);
	if (unlikely(net_xmit_eval(err))) {
		dev->stats.tx_errors++;
		return NET_XMIT_DROP;
	}
	return NET_XMIT_SUCCESS;

drop:
	dev->stats.tx_errors++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
ipvlan设备相当于macvlan设备的扩充,L3模式在发送数据包时 不一定会走ipvlan宿主设备,可能通过路由走其他设备发送出去。