概述
网络中断处理是Linux网络栈性能的关键因素,它直接影响系统的网络吞吐量和延迟。本文介绍Linux网络中断处理的完整机制,包括硬中断处理、软中断机制、NAPI轮询以及各种中断优化技术。
1. 网络中断处理架构
1.1 中断处理的核心挑战
现代高速网络对中断处理提出了严峻挑战:
- 中断风暴:高速网络产生大量中断,消耗CPU资源
- 延迟控制:在保证低延迟的同时维持高吞吐量
- 多核扩展:有效利用多核CPU进行中断处理
- 内存带宽:避免中断处理成为内存访问瓶颈
- 功耗控制:在移动设备上优化功耗表现
1.2 网络中断处理架构图
graph TB
subgraph "硬件层"
NIC[网卡硬件]
DMA[DMA控制器]
PCIE[PCIe总线]
IOAPIC[I/O APIC]
end
subgraph "中断控制器"
MSI[MSI/MSI-X]
VECTOR[中断向量]
IRQ_DESC[中断描述符]
IRQ_CHIP[中断芯片]
end
subgraph "硬中断处理"
HARD_ISR[硬中断服务程序]
IRQ_HANDLER[中断处理函数]
NAPI_SCHEDULE[NAPI调度]
IRQ_DISABLE[中断禁用]
end
subgraph "软中断系统"
SOFTIRQ[软中断框架]
NET_RX[NET_RX_SOFTIRQ]
NET_TX[NET_TX_SOFTIRQ]
KSOFTIRQD[ksoftirqd线程]
end
subgraph "NAPI轮询机制"
NAPI_LIST[NAPI轮询列表]
POLL_FUNC[poll()函数]
GRO_PROC[GRO处理]
BUDGET_CTRL[预算控制]
end
subgraph "CPU调度"
CURR_CPU[当前CPU]
OTHER_CPU[其他CPU]
RPS_STEER[RPS引导]
IRQ_BALANCE[中断均衡]
end
subgraph "性能优化"
IRQ_COALESCE[中断合并]
IRQ_AFFINITY[中断亲和性]
THREADED_IRQ[线程化中断]
IRQ_POLL[中断轮询]
end
%% 硬件到中断控制器
NIC --> DMA
DMA --> PCIE
PCIE --> MSI
MSI --> VECTOR
VECTOR --> IRQ_DESC
IRQ_DESC --> IRQ_CHIP
%% 硬中断处理流程
IRQ_CHIP --> HARD_ISR
HARD_ISR --> IRQ_HANDLER
IRQ_HANDLER --> IRQ_DISABLE
IRQ_DISABLE --> NAPI_SCHEDULE
%% 软中断处理
NAPI_SCHEDULE --> SOFTIRQ
SOFTIRQ --> NET_RX
NET_RX --> NAPI_LIST
NAPI_LIST --> POLL_FUNC
POLL_FUNC --> GRO_PROC
%% 预算和调度控制
GRO_PROC --> BUDGET_CTRL
BUDGET_CTRL --> KSOFTIRQD
%% 多CPU处理
NET_RX -.-> RPS_STEER
RPS_STEER --> OTHER_CPU
IRQ_CHIP -.-> IRQ_BALANCE
%% 优化技术
IRQ_HANDLER -.-> IRQ_COALESCE
IRQ_CHIP -.-> IRQ_AFFINITY
SOFTIRQ -.-> THREADED_IRQ
POLL_FUNC -.-> IRQ_POLL
style HARD_ISR fill:#e1f5fe
style NET_RX fill:#f3e5f5
style NAPI_LIST fill:#e8f5e8
style GRO_PROC fill:#fff3e0
style IRQ_COALESCE fill:#fce4ec
2. 硬中断处理机制
2.1 中断描述符和处理
/**
 * struct irq_desc - interrupt descriptor
 *
 * One descriptor exists per interrupt number and holds the management
 * information for that interrupt.
 */
struct irq_desc {
struct irq_common_data irq_common_data; /* common interrupt data */
struct irq_data irq_data; /* interrupt data */
unsigned int __percpu *kstat_irqs; /* per-CPU interrupt statistics */
irq_flow_handler_t handle_irq; /* flow handler */
struct irqaction *action; /* list of interrupt actions */
unsigned int status_use_accessors; /* status flags */
unsigned int core_internal_state__do_not_mess_with_it; /* internal state */
unsigned int depth; /* disable depth */
unsigned int wake_depth; /* wake depth */
unsigned int tot_count; /* total interrupt count */
unsigned int irq_count; /* interrupt count */
unsigned long last_unhandled; /* time of the last unhandled interrupt */
unsigned int irqs_unhandled; /* number of unhandled interrupts */
atomic_t threads_handled; /* count of interrupts handled by threads */
int threads_handling; /* number of threads currently handling */
raw_spinlock_t lock; /* descriptor lock */
struct cpumask *percpu_enabled; /* per-CPU enabled mask */
const struct cpumask *percpu_affinity; /* per-CPU affinity */
const struct cpumask *affinity_hint; /* affinity hint */
struct irq_affinity_notify *affinity_notify; /* affinity change notifier */
#ifdef CONFIG_PROC_FS
struct proc_dir_entry *dir; /* proc directory */
#endif
#ifdef CONFIG_SPARSE_IRQ
struct rcu_head rcu; /* RCU head */
struct kobject kobj; /* kernel object */
#endif
struct mutex request_mutex; /* request mutex */
int parent_irq; /* parent interrupt */
struct module *owner; /* owning module */
const char *name; /* interrupt name */
} ____cacheline_internodealigned_in_smp;
/**
 * struct irqaction - interrupt action
 *
 * Describes one concrete handling action for an interrupt. Actions are
 * chained through @next, which is how interrupt sharing is supported.
 */
struct irqaction {
irq_handler_t handler; /* interrupt handler function */
void *dev_id; /* device cookie */
void __percpu *percpu_dev_id; /* per-CPU device cookie */
struct irqaction *next; /* next action (shared interrupts) */
irq_handler_t thread_fn; /* threaded interrupt function */
struct task_struct *thread; /* interrupt thread */
struct irqaction *secondary; /* secondary action */
unsigned int irq; /* interrupt number */
unsigned int flags; /* interrupt flags */
unsigned long thread_flags; /* thread flags */
unsigned long thread_mask; /* thread mask */
const char *name; /* action name */
struct proc_dir_entry *dir; /* proc directory */
} ____cacheline_internodealigned_in_smp;
/**
 * request_irq - request an interrupt line
 * @irq: interrupt number
 * @handler: interrupt handler function
 * @flags: interrupt flags
 * @name: interrupt name
 * @dev: device cookie
 *
 * Thin wrapper around request_threaded_irq() that passes NULL as the
 * thread function.
 *
 * Return: whatever request_threaded_irq() returns (0 on success, a
 * negative error code on failure).
 */
static inline int __must_check
request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
const char *name, void *dev)
{
return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
/**
 * e1000_intr - example hard-IRQ handler of a network device
 * @irq: interrupt number
 * @dev_id: registration cookie; used here as the struct net_device
 *
 * Reads the interrupt cause, masks the device interrupts, records link
 * events and defers the remaining work to NAPI.
 *
 * Return: IRQ_NONE when the cause register reads back as zero,
 * IRQ_HANDLED in every other case.
 */
static irqreturn_t e1000_intr(int irq, void *dev_id)
{
struct net_device *netdev = dev_id;
struct e1000_adapter *adapter = netdev_priv(netdev);
struct e1000_hw *hw = &adapter->hw;
u32 icr = er32(ICR);
if (unlikely((!icr)))
return IRQ_NONE; /* not our interrupt */
/*
 * Mask all interrupt causes by writing all-ones to IMC.
 * NOTE(review): the original comment claimed that IMC/IMS writes are
 * ignored while IMS is 0 and that this avoids a race; neither claim can
 * be confirmed from this code.
 */
ew32(IMC, ~0);
/* If the interface is down there is nothing left to do */
if (unlikely(test_bit(__E1000_DOWN, &adapter->flags)))
return IRQ_HANDLED;
/* Link event (RXSEQ or LSC cause bit), not an Rx data interrupt */
if (unlikely(icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC))) {
/* Request a link status re-check */
hw->get_link_status = 1;
/*
 * Fire the watchdog timer on the next jiffy.
 * NOTE(review): the original comment said the 80003ES2LAN does not write
 * back the LSC bit because it shares a bit with RXSEQ - hardware claim,
 * confirm against the datasheet.
 */
mod_timer(&adapter->watchdog_timer, jiffies + 1);
}
/* Interrupts are masked above; schedule NAPI */
if (likely(napi_schedule_prep(&adapter->napi))) {
/* Reset the per-poll traffic counters before the poll starts */
adapter->total_tx_bytes = 0;
adapter->total_tx_packets = 0;
adapter->total_rx_bytes = 0;
adapter->total_rx_packets = 0;
__napi_schedule(&adapter->napi);
} else {
/* NAPI could not be scheduled: re-enable interrupts unless going down */
if (!test_bit(__E1000_DOWN, &adapter->flags))
e1000_irq_enable(adapter);
}
return IRQ_HANDLED;
}
2.2 MSI/MSI-X中断机制
/**
 * pci_enable_msix_range - enable MSI-X with a vector count in a range
 * @dev: PCI device
 * @entries: MSI-X entry array
 * @minvec: smallest acceptable number of vectors
 * @maxvec: largest number of vectors to ask for
 *
 * Keeps asking for fewer vectors, as suggested by pci_enable_msix(),
 * until the request succeeds or drops below @minvec. A request for
 * exactly one vector is made through pci_enable_msix_exact().
 *
 * Return: number of vectors enabled, -ERANGE when @maxvec < @minvec,
 * -ENOSPC when fewer than @minvec are available, or the negative error
 * reported by the enabling call.
 */
int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
			  int minvec, int maxvec)
{
	int nvec = maxvec;
	int rc;

	if (minvec > maxvec)
		return -ERANGE;

	while (nvec != 1) {
		rc = pci_enable_msix(dev, entries, nvec);
		if (rc == 0)
			return nvec;
		if (rc < 0)
			return rc;
		/* rc > 0: that many vectors could be allocated instead */
		if (rc < minvec)
			return -ENOSPC;
		nvec = rc;
	}

	rc = pci_enable_msix_exact(dev, entries, nvec);
	return rc < 0 ? rc : nvec;
}
/**
 * e1000e_request_msix - example MSI-X setup for a network device
 * @adapter: board private structure
 *
 * Requests three vectors in order: RX queue 0, TX queue 0 and "other"
 * causes. The per-ring IRQ name is "<ifname>-rx-0"/"<ifname>-tx-0" when
 * the interface name is short enough, otherwise the bare interface name.
 *
 * Return: 0 on success, or the error of the first failing request_irq().
 */
static int e1000e_request_msix(struct e1000_adapter *adapter)
{
	struct net_device *netdev = adapter->netdev;
	int vector = 0;
	int err;

	/* RX vector */
	if (strlen(netdev->name) >= (IFNAMSIZ - 5))
		memcpy(adapter->rx_ring->name, netdev->name, IFNAMSIZ);
	else
		snprintf(adapter->rx_ring->name,
			 sizeof(adapter->rx_ring->name) - 1,
			 "%s-rx-0", netdev->name);

	err = request_irq(adapter->msix_entries[vector].vector,
			  e1000_intr_msix_rx, 0, adapter->rx_ring->name,
			  netdev);
	if (err)
		return err;

	adapter->rx_ring->itr_register = E1000_EITR(vector);
	adapter->rx_ring->itr_val = adapter->itr;
	vector++;

	/* TX vector */
	if (strlen(netdev->name) >= (IFNAMSIZ - 5))
		memcpy(adapter->tx_ring->name, netdev->name, IFNAMSIZ);
	else
		snprintf(adapter->tx_ring->name,
			 sizeof(adapter->tx_ring->name) - 1,
			 "%s-tx-0", netdev->name);

	err = request_irq(adapter->msix_entries[vector].vector,
			  e1000_intr_msix_tx, 0, adapter->tx_ring->name,
			  netdev);
	if (err)
		return err;

	adapter->tx_ring->itr_register = E1000_EITR(vector);
	adapter->tx_ring->itr_val = adapter->itr;
	vector++;

	/* Vector for the remaining causes (link state and so on) */
	return request_irq(adapter->msix_entries[vector].vector,
			   e1000_msix_other, 0, netdev->name, netdev);
}
3. 软中断机制详解
3.1 软中断框架实现
/**
 * struct softirq_action - softirq action
 *
 * Holds the handler of one softirq; the handler receives a pointer to
 * its own softirq_action.
 */
struct softirq_action {
void (*action)(struct softirq_action *);
};
/**
 * Softirq type definitions. HI_SOFTIRQ is 0 and the values increase in
 * declaration order; NR_SOFTIRQS is the number of entries.
 */
enum {
HI_SOFTIRQ=0, /* high-priority softirq */
TIMER_SOFTIRQ, /* timer softirq */
NET_TX_SOFTIRQ, /* network transmit softirq */
NET_RX_SOFTIRQ, /* network receive softirq */
BLOCK_SOFTIRQ, /* block device softirq */
IRQ_POLL_SOFTIRQ, /* IRQ poll softirq */
TASKLET_SOFTIRQ, /* tasklet softirq */
SCHED_SOFTIRQ, /* scheduler softirq */
HRTIMER_SOFTIRQ, /* high-resolution timer softirq */
RCU_SOFTIRQ, /* RCU softirq */
NR_SOFTIRQS /* number of softirqs */
};
/**
 * __do_softirq - main softirq processing function
 *
 * Runs the handlers of all pending softirqs on this CPU. Processing is
 * restarted while new softirqs keep arriving, bounded by
 * MAX_SOFTIRQ_TIME, MAX_SOFTIRQ_RESTART and need_resched(); whatever is
 * still pending after that is handed to ksoftirqd.
 */
asmlinkage __visible void __do_softirq(void)
{
	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
	unsigned long old_flags = current->flags;
	int max_restart = MAX_SOFTIRQ_RESTART;
	struct softirq_action *h;
	bool in_hardirq;
	__u32 pending;
	int softirq_bit;

	/*
	 * The current task context is only borrowed for softirq processing,
	 * so drop PF_MEMALLOC for its duration. The saved value is restored
	 * on exit.
	 */
	current->flags &= ~PF_MEMALLOC;

	pending = local_softirq_pending();

	softirq_handle_begin();
	in_hardirq = lockdep_softirq_start();
	account_irq_enter_time(current);

restart:
	/* Reset the pending bitmask before interrupts are enabled */
	set_softirq_pending(0);

	local_irq_enable();

	h = softirq_vec;

	while ((softirq_bit = ffs(pending))) {
		unsigned int vec_nr;
		int prev_count;

		/* Advance to the vector of the lowest pending bit */
		h += softirq_bit - 1;

		vec_nr = h - softirq_vec;
		prev_count = preempt_count();

		kstat_incr_softirqs_this_cpu(vec_nr);

		trace_softirq_entry(vec_nr);
		h->action(h);	/* run the softirq handler */
		trace_softirq_exit(vec_nr);

		/* A handler must leave preempt_count as it found it */
		if (unlikely(prev_count != preempt_count())) {
			pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
			       vec_nr, softirq_to_name[vec_nr], h->action,
			       prev_count, preempt_count());
			preempt_count_set(prev_count);
		}
		h++;
		pending >>= softirq_bit;
	}

	local_irq_disable();

	pending = local_softirq_pending();
	if (pending) {
		if (time_before(jiffies, end) && !need_resched() && --max_restart)
			goto restart;

		/* Out of time or restarts: let ksoftirqd handle the rest */
		wakeup_softirqd();
	}

	account_irq_exit_time(current);
	lockdep_softirq_end(in_hardirq);
	softirq_handle_end();

	/*
	 * Restore only PF_MEMALLOC. Writing old_flags back wholesale would
	 * discard every other flag change made since entry.
	 */
	current->flags &= ~PF_MEMALLOC;
	current->flags |= old_flags & PF_MEMALLOC;
}
/**
 * raise_softirq_irqoff - raise a softirq with interrupts already disabled
 * @nr: softirq number
 *
 * Marks the softirq pending and, unless called from interrupt context,
 * wakes ksoftirqd so that it gets processed.
 */
void raise_softirq_irqoff(unsigned int nr)
{
	__raise_softirq_irqoff(nr);

	/*
	 * In interrupt context the pending softirq is handled when that
	 * context is left, so no wakeup is needed.
	 */
	if (in_interrupt())
		return;

	wakeup_softirqd();
}
/**
 * __raise_softirq_irqoff - internal softirq raise
 * @nr: softirq number
 *
 * Sets bit @nr in the pending softirq mask. Asserts (via lockdep) that
 * interrupts are disabled.
 */
inline void __raise_softirq_irqoff(unsigned int nr)
{
lockdep_assert_irqs_disabled();
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
3.2 ksoftirqd内核线程
/**
 * run_ksoftirqd - one processing pass of the per-CPU ksoftirqd thread
 * @cpu: CPU number (not used by the body)
 *
 * If softirqs are pending, processes them with __do_softirq() and then
 * offers to reschedule; otherwise does nothing.
 */
static void run_ksoftirqd(unsigned int cpu)
{
	ksoftirqd_run_begin();

	if (!local_softirq_pending()) {
		ksoftirqd_run_end();
		return;
	}

	/*
	 * NOTE(review): the original comment states that running here
	 * without local_irq_enable() is safe because softirq disabling is
	 * tracked per CPU rather than per task - confirm.
	 */
	__do_softirq();
	ksoftirqd_run_end();
	cond_resched();
}
/**
 * run_ksoftirqd - main loop of the ksoftirqd thread
 * @_cpu: CPU number, passed as a pointer-sized integer
 *
 * Sleeps until softirqs are pending, processes them, and repeats until
 * the kthread is asked to stop.
 *
 * NOTE(review): this definition has the same name as the
 * run_ksoftirqd(unsigned int) shown just before it; the two cannot
 * coexist in one translation unit (presumably different kernel versions).
 *
 * Return: always 0.
 */
static int run_ksoftirqd(void *_cpu)
{
int cpu = (int)(long)_cpu; /* NOTE(review): never read in this body */
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop()) {
preempt_disable();
if (!local_softirq_pending()) {
/* Nothing pending: sleep until woken */
schedule_preempt_disabled();
}
__set_current_state(TASK_RUNNING);
while (local_softirq_pending()) {
/*
 * NOTE(review): the original comment said preemption must stay disabled
 * or the CPU may change, yet the next statement enables preemption before
 * __do_softirq(), and cond_resched() runs after preempt_disable() -
 * verify the intended ordering.
 */
preempt_enable_no_resched();
__do_softirq();
preempt_disable();
cond_resched();
}
preempt_enable();
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
return 0;
}
/**
 * wakeup_softirqd - wake this CPU's ksoftirqd thread
 *
 * Used when softirq processing has to be continued in thread context,
 * e.g. by __do_softirq() once it runs out of time or restarts.
 *
 * No in_interrupt() check is made here: __do_softirq() calls this
 * function while it is still inside softirq processing, so such a check
 * would silently drop exactly the wakeup it relies on. Callers that must
 * not wake from interrupt context (raise_softirq_irqoff()) test
 * in_interrupt() themselves.
 */
void wakeup_softirqd(void)
{
	struct task_struct *tsk = this_cpu_ksoftirqd();

	/* The thread may not exist yet */
	if (tsk)
		wake_up_process(tsk);
}
4. NAPI轮询机制深度分析
4.1 NAPI状态管理
/**
 * NAPI state bit definitions (bit numbers within napi_struct.state).
 */
enum {
NAPI_STATE_SCHED, /* scheduled, waiting to be polled */
NAPI_STATE_MISSED, /* another interrupt arrived while polling */
NAPI_STATE_DISABLE, /* disable pending */
NAPI_STATE_NPSVC, /* being serviced by a poll */
NAPI_STATE_LISTED, /* present in the device list */
NAPI_STATE_NO_BUSY_POLL, /* busy polling not allowed */
NAPI_STATE_IN_BUSY_POLL, /* currently busy polling */
NAPI_STATE_PREFER_BUSY_POLL, /* busy polling preferred */
NAPI_STATE_THREADED, /* threaded NAPI */
NAPI_STATE_SCHED_THREADED, /* scheduled in threaded mode */
};
/**
* napi_enable - 启用NAPI
* @n: NAPI结构指针
*
* 启用NAPI机制,允许轮询处理
*/
void napi_enable(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
do {
BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
new |= NAPIF_STATE_SCHED_THREADED;
} while (cmpxchg(&n->state, val, new) != val);
}
/**
 * napi_disable - prevent NAPI from scheduling
 * @n: NAPI context
 *
 * Marks the instance as being disabled, then waits (sleeping 1 ms per
 * attempt) until it owns both the SCHED and NPSVC bits, cancels the
 * instance timer and clears the DISABLE and PREFER_BUSY_POLL bits.
 * May sleep. SCHED and NPSVC are left set on return.
 */
void napi_disable(struct napi_struct *n)
{
might_sleep();
set_bit(NAPI_STATE_DISABLE, &n->state);
/* Wait until no one else holds SCHED, taking it ourselves */
while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
msleep(1);
/* Same for NPSVC */
while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
msleep(1);
hrtimer_cancel(&n->timer);
clear_bit(NAPI_STATE_DISABLE, &n->state);
clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
}
/**
 * napi_schedule_irqoff - schedule NAPI with interrupts disabled
 * @n: NAPI context
 *
 * Schedules the instance only if napi_schedule_prep() allows it.
 * Per the original text this is the variant typically called from
 * hard-IRQ handlers.
 */
static inline void napi_schedule_irqoff(struct napi_struct *n)
{
if (napi_schedule_prep(n))
__napi_schedule_irqoff(n);
}
/**
 * __napi_schedule_irqoff - internal NAPI schedule (interrupts disabled)
 * @n: NAPI context
 *
 * Hands the instance to ____napi_schedule() together with the current
 * CPU's softnet_data (presumably queueing it on that CPU's poll list -
 * the helper is not shown here).
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
/**
* napi_complete_done - 完成NAPI轮询
* @n: NAPI结构指针
* @work_done: 完成的工作量
*
* 当NAPI轮询完成时调用,重新启用硬件中断
* 返回值:是否成功完成
*/
bool napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags, val, new, timeout = 0;
bool ret = true;
/*
* 1) 不要让napi成为线程化的,如果在硬/软irq上下文之外被调用
* 2) 如果禁用了中断,我们需要原子操作,因为我们可能在竞争
* napi_disable()
*/
if (unlikely(in_irq() || irqs_disabled())) {
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
NAPIF_STATE_SCHED_THREADED);
/* 如果在服务中,不要完成 */
if (val & NAPIF_STATE_NPSVC)
new |= NAPIF_STATE_NPSVC;
val = cmpxchg(&n->state, val, new);
if (unlikely(val != (val & ~NAPIF_STATE_MISSED)))
goto reschedule;
ret = false;
}
if (n->gro_list) {
gro_normal_list(n);
if (work_done < n->weight && n->dev->gro_flush_timeout) {
/* 如果还有GRO包等待且没用完预算,设置定时器 */
timeout = n->dev->gro_flush_timeout;
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
}
}
if (unlikely(!list_empty(&n->poll_list))) {
/* 在完成过程中被重新调度了 */
WARN_ON_ONCE(1);
return false;
}
if (timeout)
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
return ret;
reschedule:
if (unlikely(val & NAPIF_STATE_MISSED)) {
__napi_schedule(n);
return false;
}
return ret;
}
5. 中断合并和节流
5.1 中断合并机制
/**
 * struct ethtool_coalesce - interrupt coalescing parameters
 *
 * Coalescing parameters of a network device, used to trade latency
 * against CPU utilisation.
 */
struct ethtool_coalesce {
__u32 cmd; /* command */
/* receive coalescing parameters */
__u32 rx_coalesce_usecs; /* receive interrupt delay (microseconds) */
__u32 rx_max_coalesced_frames; /* maximum coalesced receive frames */
__u32 rx_coalesce_usecs_irq; /* receive delay while in an interrupt */
__u32 rx_max_coalesced_frames_irq; /* maximum frames while in an interrupt */
/* transmit coalescing parameters */
__u32 tx_coalesce_usecs; /* transmit interrupt delay (microseconds) */
__u32 tx_max_coalesced_frames; /* maximum coalesced transmit frames */
__u32 tx_coalesce_usecs_irq; /* transmit delay while in an interrupt */
__u32 tx_max_coalesced_frames_irq; /* maximum frames while in an interrupt */
/* statistics related parameter */
__u32 stats_block_coalesce_usecs; /* statistics block delay */
/* adaptive coalescing */
__u32 use_adaptive_rx_coalesce; /* adaptive receive coalescing */
__u32 use_adaptive_tx_coalesce; /* adaptive transmit coalescing */
/* packet-rate related parameters */
__u32 pkt_rate_low; /* low packet rate threshold */
__u32 rx_coalesce_usecs_low; /* receive delay at low rate */
__u32 rx_max_coalesced_frames_low; /* maximum receive frames at low rate */
__u32 tx_coalesce_usecs_low; /* transmit delay at low rate */
__u32 tx_max_coalesced_frames_low; /* maximum transmit frames at low rate */
__u32 pkt_rate_high; /* high packet rate threshold */
__u32 rx_coalesce_usecs_high; /* receive delay at high rate */
__u32 rx_max_coalesced_frames_high; /* maximum receive frames at high rate */
__u32 tx_coalesce_usecs_high; /* transmit delay at high rate */
__u32 tx_max_coalesced_frames_high; /* maximum transmit frames at high rate */
/* sampling interval */
__u32 rate_sample_interval; /* rate sampling interval */
};
/**
 * e1000e_update_itr - example adaptive interrupt moderation step
 * @adapter: board private structure; receives the new setting
 * @itr_setting: current latency class
 * @packets: packets seen during the last interval
 * @bytes: bytes seen during the last interval
 *
 * Chooses the next latency class from the observed traffic and stores it
 * in adapter->rx_itr_setting. Does nothing when @packets is 0.
 */
static void e1000e_update_itr(struct e1000_adapter *adapter,
			      u16 itr_setting, int packets, int bytes)
{
	unsigned int next = itr_setting;

	if (!packets)
		return;

	if (itr_setting == lowest_latency) {
		int avg = bytes / packets;

		/* Very large average packet size: switch to bulk */
		if (avg > 8000)
			next = bulk_latency;
		else if (packets < 5 && bytes > 512)
			next = low_latency;
	} else if (itr_setting == low_latency) {
		/* 50 usec aka 20000 ints/s */
		int avg = bytes / packets;

		if (bytes > 10000) {
			if (avg > 8000 || packets < 10 || avg > 1200)
				next = bulk_latency;
			else if (packets > 35)
				next = lowest_latency;
		} else if (avg > 2000) {
			next = bulk_latency;
		} else if (packets <= 2 && bytes < 512) {
			next = lowest_latency;
		}
	} else if (itr_setting == bulk_latency) {
		/* 250 usec aka 4000 ints/s */
		if (bytes > 25000) {
			if (packets > 35)
				next = low_latency;
		} else if (bytes < 1500) {
			next = low_latency;
		}
	}

	adapter->rx_itr_setting = next;
}
6. 网络中断亲和性管理
6.1 中断亲和性设置
/**
* irq_set_affinity - 设置中断亲和性
* @irq: 中断号
* @cpumask: CPU掩码
*
* 将中断绑定到特定的CPU集合
* 返回值:成功返回0,失败返回负错误码
*/
int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_chip *chip;
unsigned long flags;
int ret = 0;
if (!desc)
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
ret = irq_set_affinity_locked(desc, cpumask, false);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
/**
 * setup_rx_irq_affinity - example RX interrupt affinity setup
 * @adapter: board private structure
 *
 * Gives every RX queue vector an affinity hint, assigning CPU numbers
 * round-robin (queue index modulo the number of online CPUs), and then
 * publishes the RX queue count to the stack once.
 *
 * NOTE(review): "i % num_online_cpus()" yields a CPU number, which is
 * only guaranteed to be online when online CPU ids are contiguous from 0.
 */
static void setup_rx_irq_affinity(struct e1000_adapter *adapter)
{
	struct net_device *netdev = adapter->netdev;
	unsigned int nr_cpus = num_online_cpus();
	int i, err;

	if (adapter->num_rx_queues <= 0)
		return;

	if (adapter->msix_entries) {
		for (i = 0; i < adapter->num_rx_queues; i++) {
			int cpu = i % nr_cpus;
			int vector = adapter->msix_entries[i].vector;

			irq_set_affinity_hint(vector, cpumask_of(cpu));
		}
	}

	/* Loop-invariant: report the real RX queue count a single time */
	err = netif_set_real_num_rx_queues(netdev, adapter->num_rx_queues);
	if (err)
		netdev_warn(netdev, "failed to set RX queue count: %d\n", err);
}
/**
 * irq_cpu_rmap_add - add an interrupt to a CPU reverse map
 * @rmap: CPU reverse map
 * @irq: interrupt number
 *
 * Adds the interrupt's descriptor to @rmap and registers the matching
 * affinity notifier, so the map can follow affinity changes (used for
 * RFS steering according to the original text).
 *
 * Return: the map index on success, or a negative error code from
 * cpu_rmap_add() or irq_set_affinity_notifier().
 */
int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq)
{
	int index, err;

	index = cpu_rmap_add(rmap, irq_to_desc(irq));
	if (index < 0)
		return index;

	/*
	 * A failed notifier registration used to be reported as success.
	 * NOTE(review): the slot taken by cpu_rmap_add() stays allocated on
	 * this error path; no removal helper is visible here.
	 */
	err = irq_set_affinity_notifier(irq, &rmap->obj[index].notify);
	if (err)
		return err;

	return index;
}
6.2 中断负载均衡
/**
 * struct irq_balance_info - per-interrupt load balancing state
 *
 * @irq and @current_cpu are read and written by irq_balance_work() and
 * are appended at the end so the existing member layout is unchanged.
 */
struct irq_balance_info {
	unsigned long last_move;	/* time (jiffies) of the last move */
	unsigned int irq_delta;		/* change in interrupt count */
	unsigned long local_count;	/* local count */
	unsigned long global_count;	/* global count */
	struct list_head list;		/* entry in the balance list */
	struct cpumask allowed_mask;	/* CPUs the interrupt may use */
	unsigned int numa_node;		/* NUMA node */
	unsigned int package_id;	/* package id */
	unsigned int core_id;		/* core id */
	int class;			/* interrupt class */
	unsigned long load;		/* load value */
	unsigned int irq;		/* interrupt number being balanced */
	int current_cpu;		/* CPU the interrupt is steered to */
};
/**
 * irq_balance_work - periodic interrupt load balancing
 * @work: work item; assumed to be embedded in the delayed work that
 *        queued this function - TODO confirm
 *
 * For every interrupt on irq_balance_list that has not moved within
 * IRQ_BALANCE_INTERVAL, finds the least loaded allowed online CPU and
 * moves the interrupt there when that CPU is lighter than the current
 * one by more than BALANCE_THRESHOLD. Re-arms itself afterwards.
 */
static void irq_balance_work(struct work_struct *work)
{
	struct irq_balance_info *info;
	struct cpumask target_mask;

	/* Walk all interrupts that take part in balancing */
	list_for_each_entry(info, &irq_balance_list, list) {
		/* Reset per interrupt so earlier results do not leak in */
		unsigned long min_load = ULONG_MAX;
		unsigned long cur_load;
		int target_cpu = -1;
		int cpu;

		if (time_before(jiffies, info->last_move + IRQ_BALANCE_INTERVAL))
			continue;

		/* Find the least loaded CPU that is allowed and online */
		cpumask_and(&target_mask, &info->allowed_mask, cpu_online_mask);
		for_each_cpu(cpu, &target_mask) {
			unsigned long load = cpu_irq_load(cpu);

			if (load < min_load) {
				min_load = load;
				target_cpu = cpu;
			}
		}

		if (target_cpu < 0 || target_cpu == info->current_cpu)
			continue;

		/*
		 * Compare without letting the unsigned subtraction wrap when
		 * the current load is below the threshold.
		 */
		cur_load = cpu_irq_load(info->current_cpu);
		if (cur_load <= BALANCE_THRESHOLD ||
		    min_load >= cur_load - BALANCE_THRESHOLD)
			continue;

		/* Record the move only if the affinity change succeeded */
		if (!irq_set_affinity(info->irq, cpumask_of(target_cpu))) {
			info->last_move = jiffies;
			info->current_cpu = target_cpu;
		}
	}

	/*
	 * Re-arm. "&irq_balance_work" would be this function's address, not
	 * a delayed work item, so recover the item from @work instead.
	 */
	schedule_delayed_work(to_delayed_work(work), IRQ_BALANCE_INTERVAL);
}
7. Busy Polling优化
7.1 用户空间忙轮询
/**
 * sk_busy_loop - busy-poll the NAPI instance behind a socket
 * @sk: socket
 * @nonblock: when non-zero the deadline is 0 and the loop ends after one
 *            pass
 *
 * Polls the socket's NAPI instance from the calling context until data
 * is queued on the socket, the deadline passes, or @nonblock is set.
 * Returns nothing.
 *
 * NOTE(review): this excerpt is not self-consistent and needs checking
 * against the real kernel source before it is relied upon:
 *  - prefer_busy_poll and budget are passed to busy_poll_stop() but are
 *    not declared in this function;
 *  - napi_poll is tested after need_resched() although it is only
 *    assigned inside the polling branch and has no initialiser;
 *  - rcu_read_unlock()/rcu_read_lock() appear without a matching
 *    rcu_read_lock() before the loop;
 *  - netpoll_poll_lock() is called twice per pass;
 *  - busy_poll_spin is declared but never used.
 */
void sk_busy_loop(struct sock *sk, int nonblock)
{
unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
int (*busy_poll_spin)(struct napi_struct *napi);
void *have_poll_lock = NULL;
struct napi_struct *napi;
unsigned int napi_id;
restart:
/* Look up the NAPI instance recorded on the socket */
napi_id = READ_ONCE(sk->sk_napi_id);
if (napi_id < MIN_NAPI_ID)
return;
napi = napi_by_id(napi_id);
if (!napi)
return;
preempt_disable();
for (;;) {
int work = 0;
local_bh_disable();
if (!napi_disable_pending(napi)) {
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
local_bh_enable();
have_poll_lock = netpoll_poll_lock(napi);
/* Run the driver poll routine with the busy-poll budget */
napi_poll = napi->poll;
work = napi_poll(napi, BUSY_POLL_BUDGET);
trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
gro_normal_list(napi);
if (work > 0)
__NET_ADD_STATS(sock_net(sk),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
local_bh_disable();
if (!netpoll_poll_lock(napi)) {
/* Lock not obtained: stop and start over */
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
goto restart;
}
cpu_relax(); /* relax the CPU while spinning */
}
local_bh_enable();
/* Done when non-blocking, data is queued, or the deadline passed */
if (nonblock || !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
busy_loop_timeout(end_time))
break;
if (unlikely(need_resched())) {
if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
rcu_read_unlock();
cond_resched();
if (unlikely(!skb_queue_empty_lockless(&sk->sk_receive_queue)))
return;
rcu_read_lock();
goto restart;
}
cpu_relax(); /* wait for more packets */
}
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable();
}
/**
 * busy_poll_stop - leave busy polling
 * @napi: NAPI context
 * @have_poll_lock: token returned by netpoll_poll_lock()
 * @prefer_busy_poll: whether busy polling is preferred
 * @budget: poll budget
 *
 * Ends busy polling and releases the poll lock.
 *
 * NOTE(review): rc is read below but never assigned in this excerpt, so
 * every branch on it depends on an indeterminate value; timeout is
 * declared and never used. The statement that produced rc appears to be
 * missing - check against the real kernel source.
 */
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
bool prefer_busy_poll, u16 budget)
{
bool skip_schedule = false;
unsigned long timeout;
int rc;
/* Packets were processed */
if (rc > 0) {
/* Below budget: complete the cycle */
if (rc < budget) {
napi_complete_done(napi, rc);
if (prefer_busy_poll) {
napi_schedule_prep(napi);
skip_schedule = true;
}
}
} else if (rc == 0) {
/* Nothing received: complete the cycle */
napi_complete_done(napi, 0);
}
if (!skip_schedule) {
/* Leave busy-poll state */
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
if (prefer_busy_poll) {
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
}
/* Schedule again when a disable is pending */
if (napi_disable_pending(napi)) {
napi_schedule(napi);
}
}
netpoll_poll_unlock(have_poll_lock);
}
8. 中断处理性能监控
8.1 中断统计信息
/**
 * show_interrupts - print one line of /proc/interrupts
 * @p: seq_file to print to
 * @v: pointer to the loff_t holding the interrupt number
 *
 * Prints the header on the first call (interrupt 0), one line per
 * interrupt with its per-CPU counts, chip, hwirq, trigger type and
 * action names, and delegates to arch_show_interrupts() once all
 * interrupts have been shown.
 *
 * Return: 0, or the result of arch_show_interrupts().
 */
int show_interrupts(struct seq_file *p, void *v)
{
	static int prec;

	unsigned long flags, any_count = 0;
	int i = *(loff_t *) v, j;
	struct irqaction *action;
	struct irq_desc *desc;

	if (i > ACTUAL_NR_IRQS)
		return 0;

	if (i == ACTUAL_NR_IRQS)
		return arch_show_interrupts(p, prec);

	/* Print the header and compute the width of the first column */
	if (i == 0) {
		/*
		 * Without this, prec stays 0 and the "%*d"/"%*s" fields
		 * below collapse to their minimum width.
		 */
		for (prec = 3, j = 1000; prec < 10 && j <= ACTUAL_NR_IRQS; ++prec)
			j *= 10;

		seq_printf(p, "%*s", prec + 8, "");
		for_each_online_cpu(j)
			seq_printf(p, "CPU%-8d", j);
		seq_putc(p, '\n');
	}

	desc = irq_to_desc(i);
	if (!desc || irq_settings_is_hidden(desc))
		return 0;

	raw_spin_lock_irqsave(&desc->lock, flags);

	for_each_online_cpu(j)
		any_count |= kstat_irqs_cpu(i, j);

	/* Skip interrupts with no handler and no counts */
	if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
		goto out;

	seq_printf(p, "%*d: ", prec, i);
	for_each_online_cpu(j)
		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));

	if (desc->irq_data.chip) {
		if (desc->irq_data.chip->irq_print_chip)
			desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
		else if (desc->irq_data.chip->name)
			seq_printf(p, " %8s", desc->irq_data.chip->name);
		else
			seq_printf(p, " %8s", "-");
	} else {
		seq_printf(p, " %8s", "None");
	}

	if (desc->irq_data.domain)
		seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
	else
		seq_printf(p, " %*s", prec, "");

	seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");

	if (desc->name)
		seq_printf(p, "-%-8s", desc->name);

	/* List every action sharing this interrupt */
	action = desc->action;
	if (action) {
		seq_printf(p, " %s", action->name);
		while ((action = action->next) != NULL)
			seq_printf(p, ", %s", action->name);
	}

	seq_putc(p, '\n');
out:
	raw_spin_unlock_irqrestore(&desc->lock, flags);
	return 0;
}
/**
 * show_softirqs - print /proc/softirqs
 * @p: seq_file to print to
 * @v: unused
 *
 * Prints a header with one column per possible CPU, then one row per
 * softirq type with its per-CPU counts.
 *
 * Return: always 0.
 */
static int show_softirqs(struct seq_file *p, void *v)
{
	int nr, cpu;

	/* Header row */
	seq_puts(p, " ");
	for_each_possible_cpu(cpu)
		seq_printf(p, "CPU%-8d", cpu);
	seq_putc(p, '\n');

	/* One row per softirq type */
	nr = 0;
	while (nr < NR_SOFTIRQS) {
		seq_printf(p, "%12s:", softirq_to_name[nr]);
		for_each_possible_cpu(cpu)
			seq_printf(p, " %10u", kstat_softirqs_cpu(nr, cpu));
		seq_putc(p, '\n');
		nr++;
	}

	return 0;
}
9. 高级中断优化技术
9.1 线程化中断
/**
 * request_threaded_irq - allocate an interrupt line with optional thread
 * @irq: interrupt number
 * @handler: hard-IRQ handler; may be NULL when @thread_fn is given
 * @thread_fn: handler run in the interrupt thread; may be NULL
 * @irqflags: interrupt flags
 * @devname: device name
 * @dev_id: device cookie; required for shared interrupts
 *
 * Validates the arguments, allocates an irqaction and installs it with
 * __setup_irq().
 *
 * Return: 0 on success; -ENOTCONN for IRQ_NOTCONNECTED, -EINVAL for
 * invalid arguments or an unusable descriptor, -ENOMEM when the action
 * cannot be allocated, or the error returned by __setup_irq().
 */
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
			 irq_handler_t thread_fn, unsigned long irqflags,
			 const char *devname, void *dev_id)
{
	const bool shared = irqflags & IRQF_SHARED;
	const bool cond_suspend = irqflags & IRQF_COND_SUSPEND;
	const bool no_suspend = irqflags & IRQF_NO_SUSPEND;
	struct irqaction *action;
	struct irq_desc *desc;
	int retval;

	if (irq == IRQ_NOTCONNECTED)
		return -ENOTCONN;

	/* A shared interrupt needs a cookie to tell its users apart */
	if (shared && !dev_id)
		return -EINVAL;
	/* COND_SUSPEND requires SHARED and excludes NO_SUSPEND */
	if (cond_suspend && (!shared || no_suspend))
		return -EINVAL;

	desc = irq_to_desc(irq);
	if (!desc)
		return -EINVAL;

	if (!irq_settings_can_request(desc))
		return -EINVAL;
	if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
		return -EINVAL;

	if (!handler) {
		if (!thread_fn)
			return -EINVAL;
		/* Thread-only request: use the default primary handler */
		handler = irq_default_primary_handler;
	}

	action = kzalloc(sizeof(*action), GFP_KERNEL);
	if (!action)
		return -ENOMEM;

	action->handler = handler;
	action->thread_fn = thread_fn;
	action->flags = irqflags;
	action->name = devname;
	action->dev_id = dev_id;

	retval = __setup_irq(irq, desc, action);
	if (retval) {
		kfree(action->secondary);
		kfree(action);
	}

#ifdef CONFIG_DEBUG_SHIRQ_FIXME
	if (!retval && shared) {
		/*
		 * Shared interrupt debugging: toggle the line once so that
		 * handlers which are not ready for it show up early.
		 */
		disable_irq(irq);
		enable_irq(irq);
	}
#endif
	return retval;
}
/**
 * irq_thread - main function of an interrupt handler thread
 * @data: the struct irqaction served by this thread
 *
 * Waits for the interrupt, runs the (forced or regular) thread handler
 * and repeats until irq_wait_for_interrupt() reports that the thread
 * should stop.
 *
 * Return: always 0.
 */
static int irq_thread(void *data)
{
	struct callback_head on_exit_work;
	struct irqaction *action = data;
	struct irq_desc *desc = irq_to_desc(action->irq);
	irqreturn_t (*handler_fn)(struct irq_desc *desc,
				  struct irqaction *action);

	if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
					 &action->thread_flags))
		handler_fn = irq_forced_thread_fn;
	else
		handler_fn = irq_thread_fn;

	/* Register cleanup for the case that the thread dies unexpectedly */
	init_task_work(&on_exit_work, irq_thread_dtor);
	task_work_add(current, &on_exit_work, TWA_NONE);

	irq_thread_check_affinity(desc, action);

	while (!irq_wait_for_interrupt(action)) {
		irqreturn_t action_ret;

		irq_thread_check_affinity(desc, action);

		action_ret = handler_fn(desc, action);
		if (action_ret == IRQ_HANDLED)
			atomic_inc(&desc->threads_handled);

		migrate_disable();
		add_interrupt_randomness(action->irq, 0);
		migrate_enable();

		cond_resched();
	}

	/*
	 * Regular exit path. on_exit_work lives on this stack frame, so the
	 * pending task work must be cancelled before returning; otherwise
	 * irq_thread_dtor() would still run at task exit for a thread that
	 * stopped normally.
	 */
	task_work_cancel(current, irq_thread_dtor);
	return 0;
}
9.2 中断合并优化
/**
 * struct irq_coalesce_params - interrupt coalescing parameters
 */
struct irq_coalesce_params {
u32 max_packets; /* maximum number of packets */
u32 max_usecs; /* maximum number of microseconds */
u32 pkt_rate_low; /* low packet rate */
u32 pkt_rate_high; /* high packet rate */
u32 rate_sample_interval; /* sampling interval */
bool use_adaptive; /* use adaptive coalescing */
};
/**
 * adaptive_coalesce_update - adapt coalescing to the current load
 * @adapter: network adapter
 * @packets: packets handled in the sample
 * @bytes: bytes handled in the sample (not used by the body)
 * @usecs: duration of the sample in microseconds
 *
 * Derives a packet rate from the sample, halves the coalescing limits
 * at low rate (down to a floor of 10 usecs / 4 packets), doubles them at
 * high rate (up to 200 usecs / 64 packets) and pushes the result to the
 * hardware.
 */
static void adaptive_coalesce_update(struct net_adapter *adapter,
				     u32 packets, u32 bytes, u32 usecs)
{
	struct irq_coalesce_params *coal = &adapter->coalesce;
	u32 pps = 0;	/* packets per second */
	u32 rate;

	/* 64-bit intermediate: packets * 1000000 overflows u32 above 4294 */
	if (usecs > 0)
		pps = (u32)min_t(u64, div_u64((u64)packets * 1000000ULL, usecs),
				 U32_MAX);

	/* Rate estimate: mean of the sample and the two thresholds */
	rate = (u32)div_u64((u64)pps + coal->pkt_rate_low + coal->pkt_rate_high, 3);

	if (rate < coal->pkt_rate_low) {
		/*
		 * Low rate: favour latency. max() keeps a floor; with min()
		 * the values decayed to 0, and doubling 0 never recovers.
		 */
		coal->max_usecs = max(coal->max_usecs / 2, 10U);
		coal->max_packets = max(coal->max_packets / 2, 4U);
	} else if (rate > coal->pkt_rate_high) {
		/* High rate: favour throughput, capped */
		coal->max_usecs = min(coal->max_usecs * 2, 200U);
		coal->max_packets = min(coal->max_packets * 2, 64U);
	}

	/* Apply the coalescing settings to the hardware */
	adapter_update_coalesce_hw(adapter, coal);
}
/**
 * napi_gro_frags - GRO receive for a fragment-based skb
 * @napi: NAPI context whose ->skb is processed
 *
 * Resets the GRO offset of napi->skb, passes it through
 * dev_gro_receive() and finishes it with napi_frags_finish().
 *
 * Return: the result of napi_frags_finish().
 */
gro_result_t napi_gro_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	gro_result_t verdict;
	gro_result_t ret;

	trace_napi_gro_frags_entry(skb);

	skb_gro_reset_offset(skb);
	verdict = dev_gro_receive(napi, skb);
	ret = napi_frags_finish(napi, skb, verdict);

	trace_napi_gro_frags_exit(ret);
	return ret;
}
10. 网络中断调试和性能分析
10.1 中断调试工具
/**
 * softnet_seq_show - print one /proc/net/softnet_stat line
 * @seq: seq_file to print to
 * @v: the struct softnet_data of one CPU
 *
 * Prints eleven hexadecimal columns: processed, dropped, time_squeeze,
 * five reserved zero columns, cpu_collision, received_rps and the flow
 * limit count.
 *
 * Return: always 0.
 */
static int softnet_seq_show(struct seq_file *seq, void *v)
{
	struct softnet_data *sd = v;
	/* Was used without a declaration; 0 when flow limiting is off */
	unsigned int flow_limit_count = 0;

#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl)
		flow_limit_count = fl->count;
	rcu_read_unlock();
#endif

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
		   sd->processed, sd->dropped, sd->time_squeeze, 0,
		   0, 0, 0, 0,	/* reserved columns */
		   sd->cpu_collision, sd->received_rps, flow_limit_count);
	return 0;
}
/**
 * netdev_irq_stats_update - account traffic in the per-CPU IRQ stats
 * @dev: network device
 * @packets: packets to add
 * @bytes: bytes to add
 *
 * Adds both values to this CPU's statistics inside a u64_stats update
 * section.
 */
static inline void netdev_irq_stats_update(struct net_device *dev,
					   unsigned int packets,
					   unsigned int bytes)
{
	struct netdev_irq_stats *pcpu = this_cpu_ptr(dev->irq_stats);

	u64_stats_update_begin(&pcpu->syncp);
	pcpu->packets += packets;
	pcpu->bytes += bytes;
	u64_stats_update_end(&pcpu->syncp);
}
/**
 * net_rps_action_and_irq_enable - send pending RPS IPIs, enable IRQs
 * @sd: this CPU's softnet data
 *
 * Always re-enables local interrupts. With CONFIG_RPS, it first detaches
 * the pending IPI list and afterwards kicks every listed CPU that is
 * online.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *cur = sd->rps_ipi_list;

	if (cur) {
		/* Detach the list before interrupts are enabled again */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send an IPI to every remote CPU on the list */
		do {
			struct softnet_data *following = cur->rps_ipi_next;

			if (cpu_online(cur->cpu))
				smp_call_function_single_async(cur->cpu, &cur->csd);
			cur = following;
		} while (cur);
		return;
	}
#endif
	local_irq_enable();
}
10.2 性能调优参数
/**
 * Important tunables for network interrupt processing.
 */
/* network device budget parameters */
int netdev_budget __read_mostly = 300; /* softirq processing budget */
unsigned int netdev_budget_usecs __read_mostly = 2000; /* softirq time budget (microseconds) */
/* RPS/RFS parameter */
unsigned int rps_sock_flow_entries __read_mostly = 0; /* size of the RPS socket flow table */
/* GRO parameter */
int gro_normal_batch __read_mostly = 8; /* GRO normal batch size */
/**
* 调优函数:设置网络中断处理参数
*/
static void tune_network_interrupts(struct net_device *dev)
{
struct ethtool_coalesce ec;
/* 获取当前中断合并设置 */
if (dev->ethtool_ops->get_coalesce)
dev->ethtool_ops->get_coalesce(dev, &ec, NULL, NULL);
/* 根据网络类型调整参数 */
if (dev->speed >= SPEED_10000) {
/* 10Gbps+网络:优化吞吐量 */
ec.rx_coalesce_usecs = 50;
ec.rx_max_coalesced_frames = 32;
ec.tx_coalesce_usecs = 50;
ec.tx_max_coalesced_frames = 32;
ec.use_adaptive_rx_coalesce = 1;
ec.use_adaptive_tx_coalesce = 1;
} else if (dev->speed >= SPEED_1000) {
/* 1Gbps网络:平衡延迟和吞吐量 */
ec.rx_coalesce_usecs = 25;
ec.rx_max_coalesced_frames = 16;
ec.tx_coalesce_usecs = 25;
ec.tx_max_coalesced_frames = 16;
ec.use_adaptive_rx_coalesce = 1;
ec.use_adaptive_tx_coalesce = 1;
} else {
/* 低速网络:优化延迟 */
ec.rx_coalesce_usecs = 10;
ec.rx_max_coalesced_frames = 4;
ec.tx_coalesce_usecs = 10;
ec.tx_max_coalesced_frames = 4;
ec.use_adaptive_rx_coalesce = 0;
ec.use_adaptive_tx_coalesce = 0;
}
/* 应用新的合并设置 */
if (dev->ethtool_ops->set_coalesce)
dev->ethtool_ops->set_coalesce(dev, &ec, NULL, NULL);
}
11. 网络中断处理最佳实践
11.1 中断亲和性配置
#!/bin/bash
# Network IRQ affinity configuration script.
# Needs permission to write /proc/irq/*/smp_affinity_list.

# Print the IRQ numbers whose /proc/interrupts line mentions the interface.
get_irq_list() {
    local interface=$1
    grep -- "$interface" /proc/interrupts | awk -F: '{print $1}' | tr -d ' '
}

# Pin one IRQ to one CPU; report failures instead of claiming success.
set_irq_affinity() {
    local irq=$1
    local cpu=$2
    if echo "$cpu" > "/proc/irq/$irq/smp_affinity_list"; then
        echo "IRQ $irq -> CPU $cpu"
    else
        echo "IRQ $irq: failed to set affinity to CPU $cpu" >&2
        return 1
    fi
}

# Spread the interface's IRQs over CPUs, starting at start_cpu.
setup_network_irq_affinity() {
    local interface=$1
    local start_cpu=$2
    local cpu=$start_cpu
    local ncpus irq
    ncpus=$(nproc)
    for irq in $(get_irq_list "$interface"); do
        # Wrap around so the CPU number never exceeds the CPU count
        set_irq_affinity "$irq" "$((cpu % ncpus))"
        cpu=$((cpu + 1))
        # Skip odd CPU numbers (assumes the second hyper-thread of each
        # core has an odd number - verify with lscpu on the target host)
        if [ $((cpu % 2)) -eq 1 ]; then
            cpu=$((cpu + 1))
        fi
    done
}

# Example: set IRQ affinity for eth0, starting at CPU 2
setup_network_irq_affinity eth0 2
11.2 性能监控脚本
#!/bin/bash
# Network interrupt performance monitoring script.

# Print one snapshot of softirq, IRQ, softnet, RPS and coalescing state.
monitor_network_interrupts() {
    local dev path iface

    echo "=== 网络中断统计 ==="

    # Softirq statistics
    echo "软中断统计:"
    grep -E "(CPU|NET_RX|NET_TX)" /proc/softirqs

    # Network device interrupts
    echo -e "\n网络设备中断:"
    grep -E "(eth|wlan|enp)" /proc/interrupts

    # softnet statistics
    echo -e "\nsoftnet统计:"
    cat /proc/net/softnet_stat

    # RPS configuration
    echo -e "\nRPS配置:"
    for dev in /sys/class/net/*/queues/rx-*/rps_cpus; do
        if [ -r "$dev" ]; then
            echo "$dev: $(cat "$dev")"
        fi
    done

    # Interrupt coalescing settings (glob instead of parsing ls output)
    echo -e "\n中断合并设置:"
    for path in /sys/class/net/*; do
        iface=${path##*/}
        case $iface in
            eth*|enp*|wlan*) ;;
            *) continue ;;
        esac
        if [ -d "/sys/class/net/$iface/device" ]; then
            echo "$iface:"
            ethtool -c "$iface" 2>/dev/null | grep -E "(rx-usecs|rx-frames|tx-usecs|tx-frames)"
        fi
    done
}

# Monitor continuously, refreshing every 5 seconds
while true; do
    clear
    monitor_network_interrupts
    sleep 5
done
12. 总结
Linux网络中断处理是一个复杂而精妙的系统,通过硬中断、软中断、NAPI轮询等多层机制,实现了高性能的网络数据处理:
12.1 关键技术要点
- 分层处理:硬中断负责快速响应,软中断处理复杂逻辑
- NAPI机制:通过轮询减少中断频率,提升吞吐量
- 中断合并:聚合多个中断减少CPU开销
- 多核扩展:通过RPS/RFS技术实现多核并行处理
- 自适应优化:根据负载动态调整处理策略
12.2 性能优化建议
- 合理设置中断亲和性:避免中断集中在单个CPU
- 启用中断合并:在高吞吐量场景下减少中断频率
- 配置RPS/RFS:提升多核系统的网络性能
- 使用busy polling:在低延迟要求的场景中使用
- 监控中断统计:定期检查中断分布和处理效率
理解这些机制对于高性能网络应用开发和系统调优具有重要价值。
13. 关键函数与调用链/时序图/结构体关系
13.1 关键函数核心代码与功能说明
/* Softirq main loop */
asmlinkage __visible void __do_softirq(void);
/* NET_RX_SOFTIRQ entry point: polls the scheduled NAPI instances */
static __latent_entropy void net_rx_action(struct softirq_action *h);
/* NAPI schedule / completion */
static inline void napi_schedule(struct napi_struct *n);
bool napi_complete_done(struct napi_struct *n, int work_done);
/* Interrupt request / release */
int request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
const char *name, void *dev);
void free_irq(unsigned int irq, void *dev);
/* Interrupt affinity, and MSI-X vector allocation */
int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
int minvec, int maxvec);
- 功能说明
  - __do_softirq:软中断处理主函数,循环处理挂起软中断,必要时唤醒ksoftirqd。
  - net_rx_action:NET_RX_SOFTIRQ 的处理,遍历/轮询NAPI实例并按预算/时间片调度。
  - napi_schedule/napi_complete_done:在中断上下文中调度NAPI、在完成后做GRO flush并原子清理状态。
  - request_irq/free_irq:注册/释放设备中断处理,支持共享与线程化中断。
  - irq_set_affinity/pci_enable_msix_range:亲和性与多队列中断管理,支撑多核扩展。
13.2 中断/软中断/NAPI 调用链
- RX路径
  - NIC 触发中断 -> 驱动ISR(禁中断) -> napi_schedule -> __raise_softirq_irqoff(NET_RX) -> __do_softirq -> net_rx_action -> napi->poll(驱动) -> napi_complete_done -> 重新使能中断
- TX完成
  - NIC TX完成中断 -> 驱动ISR -> 标记TX清理 -> NAPI poll或专用TX清理 -> 协议栈统计/唤醒队列
- 自适应中断合并(示例)
  - 驱动统计收集 -> 更新 itr/coalesce 参数 -> 写NIC寄存器 -> 生效后观察软中断负载变化
13.3 中断/软中断/NAPI 时序图
sequenceDiagram
participant NIC as 网卡
participant ISR as 硬中断
participant SOFT as 软中断
participant NAPI as NAPI轮询
participant DRV as 驱动
NIC->>ISR: 中断到达
ISR->>ISR: 禁用/屏蔽队列中断
ISR->>SOFT: __raise_softirq_irqoff(NET_RX)
SOFT->>SOFT: __do_softirq()
SOFT->>NAPI: net_rx_action()
NAPI->>DRV: poll() 批量处理RX/TX完成
alt 处理完/预算未用尽
NAPI->>ISR: napi_complete_done()
ISR->>NIC: 重新使能中断
else 仍有剩余
NAPI->>SOFT: 留在repoll队列
end
13.4 关键结构体关系图(中断/软中断/NAPI)
classDiagram
class irq_desc {+handle_irq +action +affinity_hint}
class irqaction {+handler +thread_fn +name +flags}
class softirq_action {+action}
class napi_struct {+state +weight +poll() +timer}
class net_device {+name +netdev_ops}
irq_desc --> irqaction : 动作链
softirq_action --> napi_struct : NET_RX_SOFTIRQ
napi_struct --> net_device : 所属设备