每个 CPU 上的调度器会调度执行不同的线程,例如处理 OOM 的线程、处理 swap 的线程,以及我们的软中断处理线程,每个线程分配一定的时间片。如果此时调度到的是 ksoftirqd 线程,并且有 pending 的软中断等待处理, 那 thread_fn() 执行的就是 run_ksoftirqd()。一旦软中断代码判断出有 softirq 处于 pending 状态,就会开始处理, 执行 net_rx_action,从 ring buffer 收包。

net_rx_action函数中会执行网卡驱动注册的 poll() 方法,那么从这里开始看。

1 注册 poll()方法

最开始是一个宏

/*
 * Register `function` as the poll callback of the napi member embedded in
 * *priv, on net device `ndev`, with NAPI weight `weight` (forwarded verbatim
 * to netif_napi_add()).
 *
 * Every argument is parenthesized so that expression arguments such as
 * `base + i` expand with the intended precedence (`&(base + i)->napi`
 * instead of `&base + i->napi`).
 */
#define RTL_NAPI_CONFIG(ndev, priv, function, weight) \
        netif_napi_add((ndev), &(priv)->napi, (function), (weight))

然后找到调用这个宏的地方,也就是 rtl8125_init_napi 函数(在初始化那一篇中提到过)。从这里可以知道,rx 的 poll 方法是 rtl8125_poll_msix_rx 函数。注册时还传入了一个 weight(R8125_NAPI_WEIGHT),这个值是 64。这个参数控制了网卡一次 poll() 时,最多允许处理的包数。

/*
 * Set up one r8125_napi context per interrupt vector.
 *
 * For each of the tp->irq_nvecs vectors the context's back pointer (priv)
 * and vector number (index) are filled in. When CONFIG_R8125_NAPI is
 * defined, a poll callback is additionally registered for the vector via
 * RTL_NAPI_CONFIG with weight R8125_NAPI_WEIGHT:
 *   - MSI-X enabled and HwCurrIsrVer == 2:
 *       vector <  R8125_MAX_RX_QUEUES_VEC_V3 -> rtl8125_poll_msix_rx
 *       vector == 16 or 18                   -> rtl8125_poll_msix_tx
 *       any other vector                     -> rtl8125_poll_msix_other
 *   - otherwise every vector uses rtl8125_poll.
 *
 * @tp: driver private data owning the r8125napi[] array.
 */
static void rtl8125_init_napi(struct rtl8125_private *tp)
{
        int i;

        for (i = 0; i < tp->irq_nvecs; i++) {
                struct r8125_napi *r8125napi = &tp->r8125napi[i];
#ifdef CONFIG_R8125_NAPI
                int (*poll)(struct napi_struct *, int);

                if (tp->features & RTL_FEATURE_MSIX &&
                    tp->HwCurrIsrVer == 2) {
                        if (i < R8125_MAX_RX_QUEUES_VEC_V3)
                                poll = rtl8125_poll_msix_rx;
                        else if (i == 16 || i == 18)
                                poll = rtl8125_poll_msix_tx;
                        else
                                poll = rtl8125_poll_msix_other;
                } else {
                        poll = rtl8125_poll;
                }

                /* This is where the NAPI poll method is registered. */
                RTL_NAPI_CONFIG(tp->dev, r8125napi, poll, R8125_NAPI_WEIGHT);
#endif

                r8125napi->priv = tp;
                r8125napi->index = i;
        }
}

2 rtl8125_poll_msix_rx

net_rx_action会调用rtl8125_poll_msix_rx。

/*
 * NAPI poll callback for an MSI-X RX vector.
 *
 * Receives packets from the RX ring whose index equals this vector's
 * r8125napi->index. If fewer packets than the quota were handled, polling is
 * completed through RTL_NETIF_RX_COMPLETE and the vector's hardware
 * interrupt is re-enabled; otherwise the interrupt stays disabled.
 *
 * @napi:   NAPI context embedded in the vector's struct r8125_napi.
 * @budget: budget for this poll call (per the article, bounded by the
 *          weight given at registration time -- TODO confirm).
 *
 * Returns RTL_NAPI_RETURN_VALUE (macro defined elsewhere; presumably the
 * number of packets processed).
 */
static int rtl8125_poll_msix_rx(napi_ptr napi, napi_budget budget)
{
        struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi);
        struct rtl8125_private *tp = r8125napi->priv;
        RTL_GET_NETDEV(tp)
        unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev);
        unsigned int work_done = 0;
        const int message_id = r8125napi->index;

        /* The receive routine; it returns how many packets were handled. */
        work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget);

        RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget);

        /* The usual case: the quota was not used up. */
        if (work_done < work_to_do) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
                /*
                 * napi_complete_done (per the original note); when it
                 * reports FALSE the interrupt is left disabled.
                 */
                if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE)
                        return RTL_NAPI_RETURN_VALUE;
#else
                RTL_NETIF_RX_COMPLETE(dev, napi, work_done);
#endif
                /*
                 * 20040426: the barrier is not strictly required but the
                 * behavior of the irq handler could be less predictable
                 * without it. Btw, the lack of flush for the posted pci
                 * write is safe - FR
                 */
                smp_wmb();

                /* Re-enable the interrupt that belongs to this vector. */
                rtl8125_enable_hw_interrupt_v2(tp, message_id);
        }

        return RTL_NAPI_RETURN_VALUE;
}

3 rtl8125_rx_interrupt

rtl8125_rx_interrupt是具体的收包函数,最终返回收包数目。

/*
 * Drain completed descriptors from one RX ring and hand the packets to the
 * network stack.
 *
 * Starting at ring->cur_rx, descriptors are examined until the quota derived
 * from @budget is used up, a descriptor with DescOwn set is found, or a slot
 * without an skb is met. Afterwards ring->cur_rx is advanced and the ring is
 * refilled through rtl8125_rx_fill().
 *
 * @dev:    net device the packets are attributed to.
 * @tp:     driver private data.
 * @ring:   RX ring to process.
 * @budget: budget of the calling poll routine.
 *
 * Returns the number of descriptors consumed in this call (0 when the ring
 * has no descriptor array).
 */
static int
rtl8125_rx_interrupt(struct net_device *dev,
                     struct rtl8125_private *tp,
                     struct rtl8125_rx_ring *ring,
                     napi_budget budget)
{
        unsigned int cur_rx, rx_left;
        unsigned int delta, count = 0;
        unsigned int entry;
        struct RxDesc *desc;
        u32 status;
        u32 rx_quota;
        u64 rx_buf_phy_addr;
        u32 ring_index = ring->index;

        assert(dev != NULL);
        assert(tp != NULL);

        if (ring->RxDescArray == NULL)
                goto rx_out;

        rx_quota = RTL_RX_QUOTA(budget);
        /* cur_rx is the descriptor currently in use; those before it are clean. */
        cur_rx = ring->cur_rx;
        entry = cur_rx % ring->num_rx_desc;
        /* Address of that descriptor (per the article: RxDescBase + index * RxDescLength). */
        desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry);
        /*
         * rx_left is the number of descriptors that may still be used.
         * dirty_rx is how far the ring has been refilled so far.
         */
        rx_left = ring->num_rx_desc + ring->dirty_rx - cur_rx;
        /* Clamp rx_left by the quota (the smaller of the two, per the article). */
        rx_left = rtl8125_rx_quota(rx_left, (u32)rx_quota);

        for (; rx_left > 0; rx_left--) {
                status = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc));
                /* DescOwn set: nothing received yet, the NIC still owns the descriptor. */
                if (status & DescOwn)
                        break;

                rmb();

                /* Error path; normally not taken. */
                if (unlikely(rtl8125_check_rx_desc_error(dev, tp, status) < 0)) {
                        if (netif_msg_rx_err(tp)) {
                                printk(KERN_INFO
                                       "%s: Rx ERROR. status = %08x\n",
                                       dev->name, status);
                        }

                        RTLDEV->stats.rx_errors++;

                        if (dev->features & NETIF_F_RXALL)
                                goto process_pkt;

                        rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz);
                } else {
                        struct sk_buff *skb;
                        int pkt_size;

process_pkt:
                        /* Packet size is held in the low 14 bits of status. */
                        pkt_size = status & 0x00003fff;
                        if (likely(!(dev->features & NETIF_F_RXFCS)))
                                pkt_size -= ETH_FCS_LEN;

                        /*
                         * The driver does not support incoming fragmented
                         * frames. They are seen as a symptom of over-mtu
                         * sized frames.
                         */
                        if (unlikely(rtl8125_fragmented_frame(tp, status)) ||
                            unlikely(pkt_size > tp->rx_buf_sz)) {
                                RTLDEV->stats.rx_dropped++;
                                RTLDEV->stats.rx_length_errors++;
                                rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz);
                                /*
                                 * The descriptor was handed back, so move on
                                 * to the next one. A bare `continue` would
                                 * skip the cur_rx/entry/desc update below and
                                 * re-read this same descriptor.
                                 */
                                goto next_desc;
                        }

                        /* The skb attached to the current descriptor. */
                        skb = ring->Rx_skbuff[entry];

                        if (!skb)
                                break;

                        /* Bus address of the buffer used for DMA. */
                        rx_buf_phy_addr = ring->RxDescPhyAddr[entry];
                        /*
                         * Called after the device has DMA'd data into memory
                         * and before the CPU reads it, so that the CPU sees
                         * the latest data.
                         */
                        dma_sync_single_for_cpu(tp_to_dev(tp),
                                                rx_buf_phy_addr, tp->rx_buf_sz,
                                                DMA_FROM_DEVICE);

                        if (rtl8125_try_rx_copy(tp, ring, &skb, pkt_size,
                                                desc, tp->rx_buf_sz)) {
                                /* The slot gives up its skb and its DMA mapping. */
                                ring->Rx_skbuff[entry] = NULL;
                                dma_unmap_single(tp_to_dev(tp), rx_buf_phy_addr,
                                                 tp->rx_buf_sz, DMA_FROM_DEVICE);
                        } else {
                                /*
                                 * Give the buffer back to the device before
                                 * it is used for DMA again.
                                 */
                                dma_sync_single_for_device(tp_to_dev(tp), rx_buf_phy_addr,
                                                           tp->rx_buf_sz, DMA_FROM_DEVICE);
                        }

#ifdef ENABLE_RSS_SUPPORT
                        rtl8125_rx_hash(tp, (struct RxDescV3 *)desc, skb);
#endif

                        /* Sets skb->ip_summed (per the article). */
                        if (tp->cp_cmd & RxChkSum)
                                rtl8125_rx_csum(tp, skb, desc);

                        /* Fill in the remaining skb fields. */
                        skb->dev = dev;
                        skb_put(skb, pkt_size);
                        skb->protocol = eth_type_trans(skb, dev);

                        if (skb->pkt_type == PACKET_MULTICAST)
                                RTLDEV->stats.multicast++;

                        if (rtl8125_rx_vlan_skb(tp, desc, skb) < 0) {
                                /*
                                 * Ends up in napi_gro_receive (per the
                                 * article). GRO merges packets as low in the
                                 * stack as possible so upper layers see one
                                 * large packet.
                                 */
                                rtl8125_rx_skb(tp, skb, ring_index);
                        }

#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)
                        dev->last_rx = jiffies;
#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0)
                        RTLDEV->stats.rx_bytes += pkt_size;
                        RTLDEV->stats.rx_packets++;
                }

next_desc:
                cur_rx++;
                entry = cur_rx % ring->num_rx_desc;
                desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37)
                prefetch(desc);
#endif
        }

        /* How far cur_rx advanced, i.e. how many descriptors were consumed. */
        count = cur_rx - ring->cur_rx;
        ring->cur_rx = cur_rx;

        /* Refill the ring with fresh skbs. */
        delta = rtl8125_rx_fill(tp, ring, dev, ring->dirty_rx, ring->cur_rx, 1);
        if (!delta && count && netif_msg_intr(tp))
                printk(KERN_INFO "%s: no Rx buffer allocated\n", dev->name);
        ring->dirty_rx += delta;

        /*
         * FIXME: until there is periodic timer to try and refill the ring,
         * a temporary shortage may definitely kill the Rx process.
         * - disable the asic to try and avoid an overflow and kick it again
         *   after refill ?
         * - how do others driver handle this condition (Uh oh...).
         */
        if ((ring->dirty_rx + ring->num_rx_desc == ring->cur_rx) && netif_msg_intr(tp))
                printk(KERN_EMERG "%s: Rx buffers exhausted\n", dev->name);

rx_out:
        return count;
}

参考文献:

Linux 网络栈接收数据(RX):原理及内核实现(2022)

如果觉得这篇文章有用的话,可以点赞、评论或者收藏,万分感谢,goodbye~