每个 CPU 上的调度器会调度执行不同的线程,例如处理 OOM 的线程、处理 swap 的线程,以及我们的软中断处理线程,每个线程分配一定的时间片。如果此时调度到的是 ksoftirqd 线程,并且有 pending 的软中断等待处理, 那 thread_fn() 执行的就是 run_ksoftirqd()。一旦软中断代码判断出有 softirq 处于 pending 状态,就会开始处理, 执行 net_rx_action,从 ring buffer 收包。
net_rx_action函数中会执行网卡驱动注册的 poll() 方法,那么从这里开始看。
1 注册 poll()方法
最开始是一个宏
/*
 * RTL_NAPI_CONFIG - register a NAPI poll callback for one NAPI context.
 *
 * @ndev:     net_device the NAPI context belongs to
 * @priv:     pointer to a structure that embeds a `napi` member
 * @function: poll callback handed to netif_napi_add()
 * @weight:   weight handed to netif_napi_add()
 *
 * Every parameter is parenthesized so that an expression argument
 * (e.g. `base + i` for @priv) expands with the intended precedence.
 * The definition appears twice; both copies are kept token-identical so
 * the redefinition stays legal C.
 */
#define RTL_NAPI_CONFIG(ndev, priv, function, weight) netif_napi_add((ndev), &(priv)->napi, (function), (weight))
#define RTL_NAPI_CONFIG(ndev, priv, function, weight) netif_napi_add((ndev), &(priv)->napi, (function), (weight))
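然后找到使用这个宏的地方,也就是在初始化那一篇中提到过的 rtl8125_init_napi。从代码可以知道:当网卡使用 MSI-X 且 HwCurrIsrVer == 2 时,编号小于 R8125_MAX_RX_QUEUES_VEC_V3 的中断向量注册的 poll 方法是 rtl8125_poll_msix_rx(即 rx 的 poll 方法),编号为 16 和 18 的向量注册的是 rtl8125_poll_msix_tx,其余向量注册的是 rtl8125_poll_msix_other;否则统一注册 rtl8125_poll。注册时还传入了一个 weight(R8125_NAPI_WEIGHT),这个值是 64。这个参数控制了网卡一次 poll() 时,最多允许处理的包数。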
/*
 * rtl8125_init_napi - set up the per-vector NAPI bookkeeping.
 *
 * Walks every interrupt vector (tp->irq_nvecs of them), records the owning
 * private structure and the vector index in the matching r8125_napi slot
 * and, when CONFIG_R8125_NAPI is enabled, registers the poll callback that
 * belongs to that vector.
 */
static void rtl8125_init_napi(struct rtl8125_private *tp)
{
        int vec;

        for (vec = 0; vec < tp->irq_nvecs; vec++) {
                struct r8125_napi *napi_ctx = &tp->r8125napi[vec];
#ifdef CONFIG_R8125_NAPI
                /* Default handler; replaced below for MSI-X with ISR version 2. */
                int (*handler)(struct napi_struct *, int) = rtl8125_poll;

                if ((tp->features & RTL_FEATURE_MSIX) && tp->HwCurrIsrVer == 2) {
                        /*
                         * Low vectors get the Rx handler, vectors 16 and 18 the
                         * Tx handler, everything else the "other" handler.
                         * NOTE(review): 16/18 are presumably the Tx vectors of
                         * ISR v2 -- confirm against the hardware definitions.
                         */
                        if (vec < R8125_MAX_RX_QUEUES_VEC_V3)
                                handler = rtl8125_poll_msix_rx;
                        else if (vec == 16 || vec == 18)
                                handler = rtl8125_poll_msix_tx;
                        else
                                handler = rtl8125_poll_msix_other;
                }

                /* Register the selected poll callback for this vector. */
                RTL_NAPI_CONFIG(tp->dev, napi_ctx, handler, R8125_NAPI_WEIGHT);
#endif
                napi_ctx->index = vec;
                napi_ctx->priv = tp;
        }
}
/*
 * rtl8125_init_napi - set up the per-vector NAPI bookkeeping.
 *
 * Walks every interrupt vector (tp->irq_nvecs of them), records the owning
 * private structure and the vector index in the matching r8125_napi slot
 * and, when CONFIG_R8125_NAPI is enabled, registers the poll callback that
 * belongs to that vector.
 */
static void rtl8125_init_napi(struct rtl8125_private *tp)
{
        int vec;

        for (vec = 0; vec < tp->irq_nvecs; vec++) {
                struct r8125_napi *napi_ctx = &tp->r8125napi[vec];
#ifdef CONFIG_R8125_NAPI
                /* Default handler; replaced below for MSI-X with ISR version 2. */
                int (*handler)(struct napi_struct *, int) = rtl8125_poll;

                if ((tp->features & RTL_FEATURE_MSIX) && tp->HwCurrIsrVer == 2) {
                        /*
                         * Low vectors get the Rx handler, vectors 16 and 18 the
                         * Tx handler, everything else the "other" handler.
                         * NOTE(review): 16/18 are presumably the Tx vectors of
                         * ISR v2 -- confirm against the hardware definitions.
                         */
                        if (vec < R8125_MAX_RX_QUEUES_VEC_V3)
                                handler = rtl8125_poll_msix_rx;
                        else if (vec == 16 || vec == 18)
                                handler = rtl8125_poll_msix_tx;
                        else
                                handler = rtl8125_poll_msix_other;
                }

                /* Register the selected poll callback for this vector. */
                RTL_NAPI_CONFIG(tp->dev, napi_ctx, handler, R8125_NAPI_WEIGHT);
#endif
                napi_ctx->index = vec;
                napi_ctx->priv = tp;
        }
}
2 rtl8125_poll_msix_rx
net_rx_action会调用rtl8125_poll_msix_rx。
/*
* rtl8125_poll_msix_rx - NAPI poll callback for one MSI-X Rx vector.
*
* @napi:   NAPI context; the enclosing r8125_napi is recovered from it.
* @budget: work limit handed in by the NAPI core.
*
* Drains the Rx ring whose index equals this vector's index. When less
* work than the quota was done, polling is completed and the vector's
* hardware interrupt is re-enabled.
*
* Returns RTL_NAPI_RETURN_VALUE (macro defined outside this view).
*/
static int rtl8125_poll_msix_rx(napi_ptr napi, napi_budget budget)
{
struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi);
struct rtl8125_private *tp = r8125napi->priv;
/* NOTE(review): `dev` is not declared visibly; presumably this macro declares it from tp -- confirm. */
RTL_GET_NETDEV(tp)
unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev);
unsigned int work_done = 0;
const int message_id = r8125napi->index;
/* Receive routine; its return value is accumulated into work_done. */
work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget);
/* NOTE(review): budget is presumably bounded by the weight given at NAPI registration -- confirm. */
RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget);
/* Usually work_done < work_to_do, i.e. the ring was drained within the quota. */
if (work_done < work_to_do) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
/* Per the original author's note this wraps napi_complete_done; on FALSE, return without re-enabling the interrupt. */
if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) return RTL_NAPI_RETURN_VALUE;
#else
RTL_NETIF_RX_COMPLETE(dev, napi, work_done);
#endif
/*
* 20040426: the barrier is not strictly required but the
* behavior of the irq handler could be less predictable
* without it. Btw, the lack of flush for the posted pci
* write is safe - FR
*/
smp_wmb();
/* Re-enable the hardware interrupt that belongs to this vector. */
rtl8125_enable_hw_interrupt_v2(tp, message_id);
}
return RTL_NAPI_RETURN_VALUE;
}
/*
* rtl8125_poll_msix_rx - NAPI poll callback for one MSI-X Rx vector.
*
* @napi:   NAPI context; the enclosing r8125_napi is recovered from it.
* @budget: work limit handed in by the NAPI core.
*
* Drains the Rx ring whose index equals this vector's index. When less
* work than the quota was done, polling is completed and the vector's
* hardware interrupt is re-enabled.
*
* Returns RTL_NAPI_RETURN_VALUE (macro defined outside this view).
*/
static int rtl8125_poll_msix_rx(napi_ptr napi, napi_budget budget)
{
struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi);
struct rtl8125_private *tp = r8125napi->priv;
/* NOTE(review): `dev` is not declared visibly; presumably this macro declares it from tp -- confirm. */
RTL_GET_NETDEV(tp)
unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev);
unsigned int work_done = 0;
const int message_id = r8125napi->index;
/* Receive routine; its return value is accumulated into work_done. */
work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget);
/* NOTE(review): budget is presumably bounded by the weight given at NAPI registration -- confirm. */
RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget);
/* Usually work_done < work_to_do, i.e. the ring was drained within the quota. */
if (work_done < work_to_do) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
/* Per the original author's note this wraps napi_complete_done; on FALSE, return without re-enabling the interrupt. */
if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) return RTL_NAPI_RETURN_VALUE;
#else
RTL_NETIF_RX_COMPLETE(dev, napi, work_done);
#endif
/*
* 20040426: the barrier is not strictly required but the
* behavior of the irq handler could be less predictable
* without it. Btw, the lack of flush for the posted pci
* write is safe - FR
*/
smp_wmb();
/* Re-enable the hardware interrupt that belongs to this vector. */
rtl8125_enable_hw_interrupt_v2(tp, message_id);
}
return RTL_NAPI_RETURN_VALUE;
}
3 rtl8125_rx_interrupt
rtl8125_rx_interrupt 是具体的收包函数。它的返回值 count 是本次调用中 cur_rx 向前推进的描述符个数(也包含出错后被直接回收的描述符),上层的 poll 函数把它累加到 work_done 中。
/*
 * rtl8125_rx_interrupt - drain completed descriptors from one Rx ring.
 *
 * @dev:    net_device the ring belongs to
 * @tp:     driver private data
 * @ring:   Rx ring to process
 * @budget: work limit from the NAPI poll routine
 *
 * Walks the ring starting at ring->cur_rx, hands every completed frame to
 * the stack, recycles descriptors of dropped frames, then refills the ring.
 *
 * Returns the number of descriptors consumed (how far cur_rx advanced);
 * 0 when the ring has no descriptor array.
 */
static int
rtl8125_rx_interrupt(struct net_device *dev,
                     struct rtl8125_private *tp,
                     struct rtl8125_rx_ring *ring,
                     napi_budget budget)
{
        unsigned int cur_rx, rx_left;
        unsigned int delta, count = 0;
        unsigned int entry;
        struct RxDesc *desc;
        u32 status;
        u32 rx_quota;
        u64 rx_buf_phy_addr;
        u32 ring_index = ring->index;

        assert(dev != NULL);
        assert(tp != NULL);

        if (ring->RxDescArray == NULL)
                goto rx_out;

        rx_quota = RTL_RX_QUOTA(budget);

        /* cur_rx is the next descriptor to examine. */
        cur_rx = ring->cur_rx;
        entry = cur_rx % ring->num_rx_desc;
        /* Descriptor for slot `entry` of this ring. */
        desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry);

        /*
         * Descriptors that may still be examined: dirty_rx is the refill
         * cursor, so num_rx_desc + dirty_rx - cur_rx slots lie ahead of
         * cur_rx with a buffer attached.
         */
        rx_left = ring->num_rx_desc + ring->dirty_rx - cur_rx;
        /* NOTE(review): presumably clamps rx_left to the quota -- confirm helper. */
        rx_left = rtl8125_rx_quota(rx_left, (u32)rx_quota);

        for (; rx_left > 0; rx_left--) {
                status = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc));
                /* DescOwn set: the NIC still owns the descriptor, nothing received. */
                if (status & DescOwn)
                        break;

                /* Read the rest of the descriptor only after the ownership check. */
                rmb();

                /* Error path; not expected in normal operation. */
                if (unlikely(rtl8125_check_rx_desc_error(dev, tp, status) < 0)) {
                        if (netif_msg_rx_err(tp)) {
                                printk(KERN_INFO
                                       "%s: Rx ERROR. status = %08x\n",
                                       dev->name, status);
                        }

                        RTLDEV->stats.rx_errors++;

                        /* With RXALL enabled even bad frames are delivered. */
                        if (dev->features & NETIF_F_RXALL)
                                goto process_pkt;

                        rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz);
                } else {
                        struct sk_buff *skb;
                        int pkt_size;

process_pkt:
                        /* Frame length is held in the low 14 bits of the status word. */
                        pkt_size = status & 0x00003fff;
                        if (likely(!(dev->features & NETIF_F_RXFCS)))
                                pkt_size -= ETH_FCS_LEN;

                        /*
                         * The driver does not support incoming fragmented
                         * frames. They are seen as a symptom of over-mtu
                         * sized frames.
                         */
                        if (unlikely(rtl8125_fragmented_frame(tp, status)) ||
                            unlikely(pkt_size > tp->rx_buf_sz)) {
                                RTLDEV->stats.rx_dropped++;
                                RTLDEV->stats.rx_length_errors++;
                                rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz);
                                /*
                                 * Advance to the next descriptor exactly as
                                 * the error path does. A bare `continue`
                                 * would skip the cur_rx/entry/desc update and
                                 * re-read the descriptor just recycled.
                                 */
                                goto next_desc;
                        }

                        /* skb currently attached to this ring slot. */
                        skb = ring->Rx_skbuff[entry];
                        if (!skb)
                                break;

                        /* DMA address of the receive buffer. */
                        rx_buf_phy_addr = ring->RxDescPhyAddr[entry];

                        /*
                         * The device has written the buffer; sync it for the
                         * CPU before the CPU reads the data.
                         */
                        dma_sync_single_for_cpu(tp_to_dev(tp),
                                                rx_buf_phy_addr, tp->rx_buf_sz,
                                                DMA_FROM_DEVICE);

                        if (rtl8125_try_rx_copy(tp, ring, &skb, pkt_size,
                                                desc, tp->rx_buf_sz)) {
                                /*
                                 * Non-zero return: the ring slot gives up its
                                 * buffer, so clear the slot and unmap it.
                                 */
                                ring->Rx_skbuff[entry] = NULL;
                                dma_unmap_single(tp_to_dev(tp), rx_buf_phy_addr,
                                                 tp->rx_buf_sz, DMA_FROM_DEVICE);
                        } else {
                                /* Buffer stays in the ring: hand it back to the device. */
                                dma_sync_single_for_device(tp_to_dev(tp), rx_buf_phy_addr,
                                                           tp->rx_buf_sz, DMA_FROM_DEVICE);
                        }

#ifdef ENABLE_RSS_SUPPORT
                        rtl8125_rx_hash(tp, (struct RxDescV3 *)desc, skb);
#endif

                        /* Checksum offload result (per the original note: sets skb->ip_summed). */
                        if (tp->cp_cmd & RxChkSum)
                                rtl8125_rx_csum(tp, skb, desc);

                        /* Fill in the skb fields needed by the stack. */
                        skb->dev = dev;
                        skb_put(skb, pkt_size);
                        skb->protocol = eth_type_trans(skb, dev);

                        if (skb->pkt_type == PACKET_MULTICAST)
                                RTLDEV->stats.multicast++;

                        if (rtl8125_rx_vlan_skb(tp, desc, skb) < 0) {
                                /*
                                 * Per the original author's note this ends in
                                 * napi_gro_receive (GRO merges packets low in
                                 * the stack).
                                 */
                                rtl8125_rx_skb(tp, skb, ring_index);
                        }

#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)
                        dev->last_rx = jiffies;
#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)
                        RTLDEV->stats.rx_bytes += pkt_size;
                        RTLDEV->stats.rx_packets++;
                }

next_desc:
                cur_rx++;
                entry = cur_rx % ring->num_rx_desc;
                desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37)
                prefetch(desc);
#endif
        }

        /* Number of descriptors consumed in this call. */
        count = cur_rx - ring->cur_rx;
        ring->cur_rx = cur_rx;

        /* Refill the consumed slots with fresh buffers. */
        delta = rtl8125_rx_fill(tp, ring, dev, ring->dirty_rx, ring->cur_rx, 1);
        if (!delta && count && netif_msg_intr(tp))
                printk(KERN_INFO "%s: no Rx buffer allocated\n", dev->name);
        ring->dirty_rx += delta;

        /*
         * FIXME: until there is periodic timer to try and refill the ring,
         * a temporary shortage may definitely kill the Rx process.
         * - disable the asic to try and avoid an overflow and kick it again
         *   after refill ?
         * - how do others driver handle this condition (Uh oh...).
         */
        if ((ring->dirty_rx + ring->num_rx_desc == ring->cur_rx) && netif_msg_intr(tp))
                printk(KERN_EMERG "%s: Rx buffers exhausted\n", dev->name);

rx_out:
        return count;
}
/*
 * rtl8125_rx_interrupt - drain completed descriptors from one Rx ring.
 *
 * @dev:    net_device the ring belongs to
 * @tp:     driver private data
 * @ring:   Rx ring to process
 * @budget: work limit from the NAPI poll routine
 *
 * Walks the ring starting at ring->cur_rx, hands every completed frame to
 * the stack, recycles descriptors of dropped frames, then refills the ring.
 *
 * Returns the number of descriptors consumed (how far cur_rx advanced);
 * 0 when the ring has no descriptor array.
 */
static int
rtl8125_rx_interrupt(struct net_device *dev,
                     struct rtl8125_private *tp,
                     struct rtl8125_rx_ring *ring,
                     napi_budget budget)
{
        unsigned int cur_rx, rx_left;
        unsigned int delta, count = 0;
        unsigned int entry;
        struct RxDesc *desc;
        u32 status;
        u32 rx_quota;
        u64 rx_buf_phy_addr;
        u32 ring_index = ring->index;

        assert(dev != NULL);
        assert(tp != NULL);

        if (ring->RxDescArray == NULL)
                goto rx_out;

        rx_quota = RTL_RX_QUOTA(budget);

        /* cur_rx is the next descriptor to examine. */
        cur_rx = ring->cur_rx;
        entry = cur_rx % ring->num_rx_desc;
        /* Descriptor for slot `entry` of this ring. */
        desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry);

        /*
         * Descriptors that may still be examined: dirty_rx is the refill
         * cursor, so num_rx_desc + dirty_rx - cur_rx slots lie ahead of
         * cur_rx with a buffer attached.
         */
        rx_left = ring->num_rx_desc + ring->dirty_rx - cur_rx;
        /* NOTE(review): presumably clamps rx_left to the quota -- confirm helper. */
        rx_left = rtl8125_rx_quota(rx_left, (u32)rx_quota);

        for (; rx_left > 0; rx_left--) {
                status = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc));
                /* DescOwn set: the NIC still owns the descriptor, nothing received. */
                if (status & DescOwn)
                        break;

                /* Read the rest of the descriptor only after the ownership check. */
                rmb();

                /* Error path; not expected in normal operation. */
                if (unlikely(rtl8125_check_rx_desc_error(dev, tp, status) < 0)) {
                        if (netif_msg_rx_err(tp)) {
                                printk(KERN_INFO
                                       "%s: Rx ERROR. status = %08x\n",
                                       dev->name, status);
                        }

                        RTLDEV->stats.rx_errors++;

                        /* With RXALL enabled even bad frames are delivered. */
                        if (dev->features & NETIF_F_RXALL)
                                goto process_pkt;

                        rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz);
                } else {
                        struct sk_buff *skb;
                        int pkt_size;

process_pkt:
                        /* Frame length is held in the low 14 bits of the status word. */
                        pkt_size = status & 0x00003fff;
                        if (likely(!(dev->features & NETIF_F_RXFCS)))
                                pkt_size -= ETH_FCS_LEN;

                        /*
                         * The driver does not support incoming fragmented
                         * frames. They are seen as a symptom of over-mtu
                         * sized frames.
                         */
                        if (unlikely(rtl8125_fragmented_frame(tp, status)) ||
                            unlikely(pkt_size > tp->rx_buf_sz)) {
                                RTLDEV->stats.rx_dropped++;
                                RTLDEV->stats.rx_length_errors++;
                                rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz);
                                /*
                                 * Advance to the next descriptor exactly as
                                 * the error path does. A bare `continue`
                                 * would skip the cur_rx/entry/desc update and
                                 * re-read the descriptor just recycled.
                                 */
                                goto next_desc;
                        }

                        /* skb currently attached to this ring slot. */
                        skb = ring->Rx_skbuff[entry];
                        if (!skb)
                                break;

                        /* DMA address of the receive buffer. */
                        rx_buf_phy_addr = ring->RxDescPhyAddr[entry];

                        /*
                         * The device has written the buffer; sync it for the
                         * CPU before the CPU reads the data.
                         */
                        dma_sync_single_for_cpu(tp_to_dev(tp),
                                                rx_buf_phy_addr, tp->rx_buf_sz,
                                                DMA_FROM_DEVICE);

                        if (rtl8125_try_rx_copy(tp, ring, &skb, pkt_size,
                                                desc, tp->rx_buf_sz)) {
                                /*
                                 * Non-zero return: the ring slot gives up its
                                 * buffer, so clear the slot and unmap it.
                                 */
                                ring->Rx_skbuff[entry] = NULL;
                                dma_unmap_single(tp_to_dev(tp), rx_buf_phy_addr,
                                                 tp->rx_buf_sz, DMA_FROM_DEVICE);
                        } else {
                                /* Buffer stays in the ring: hand it back to the device. */
                                dma_sync_single_for_device(tp_to_dev(tp), rx_buf_phy_addr,
                                                           tp->rx_buf_sz, DMA_FROM_DEVICE);
                        }

#ifdef ENABLE_RSS_SUPPORT
                        rtl8125_rx_hash(tp, (struct RxDescV3 *)desc, skb);
#endif

                        /* Checksum offload result (per the original note: sets skb->ip_summed). */
                        if (tp->cp_cmd & RxChkSum)
                                rtl8125_rx_csum(tp, skb, desc);

                        /* Fill in the skb fields needed by the stack. */
                        skb->dev = dev;
                        skb_put(skb, pkt_size);
                        skb->protocol = eth_type_trans(skb, dev);

                        if (skb->pkt_type == PACKET_MULTICAST)
                                RTLDEV->stats.multicast++;

                        if (rtl8125_rx_vlan_skb(tp, desc, skb) < 0) {
                                /*
                                 * Per the original author's note this ends in
                                 * napi_gro_receive (GRO merges packets low in
                                 * the stack).
                                 */
                                rtl8125_rx_skb(tp, skb, ring_index);
                        }

#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)
                        dev->last_rx = jiffies;
#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)
                        RTLDEV->stats.rx_bytes += pkt_size;
                        RTLDEV->stats.rx_packets++;
                }

next_desc:
                cur_rx++;
                entry = cur_rx % ring->num_rx_desc;
                desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37)
                prefetch(desc);
#endif
        }

        /* Number of descriptors consumed in this call. */
        count = cur_rx - ring->cur_rx;
        ring->cur_rx = cur_rx;

        /* Refill the consumed slots with fresh buffers. */
        delta = rtl8125_rx_fill(tp, ring, dev, ring->dirty_rx, ring->cur_rx, 1);
        if (!delta && count && netif_msg_intr(tp))
                printk(KERN_INFO "%s: no Rx buffer allocated\n", dev->name);
        ring->dirty_rx += delta;

        /*
         * FIXME: until there is periodic timer to try and refill the ring,
         * a temporary shortage may definitely kill the Rx process.
         * - disable the asic to try and avoid an overflow and kick it again
         *   after refill ?
         * - how do others driver handle this condition (Uh oh...).
         */
        if ((ring->dirty_rx + ring->num_rx_desc == ring->cur_rx) && netif_msg_intr(tp))
                printk(KERN_EMERG "%s: Rx buffers exhausted\n", dev->name);

rx_out:
        return count;
}
参考文献:
Linux 网络栈接收数据(RX):原理及内核实现(2022)
如果觉得这篇文章有用的话,可以点赞、评论或者收藏,万分感谢,goodbye~