/* The interrupt vector the interrupt controller sends to
 * each CPU is a per-CPU local variable, but the abstraction
 * layer's virtual irq number and its irq_desc are global.
 * So each CPU needs its own mapping from vector to irq_desc. */
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
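As a hedged illustration of how this table is consumed (abridged from the x86 flow; the helper name is made up), the arch-level dispatcher reads the per-CPU slot for the hardware vector and hands the resulting irq_desc to the generic layer:
/* Sketch only: roughly what x86's common_interrupt() does with vector_irq;
 * regs handling, spurious-vector and error paths are omitted. */
static void demo_dispatch_vector(unsigned long vector)
{
	struct irq_desc *desc = __this_cpu_read(vector_irq[vector]); /* per-CPU vector -> desc */

	if (!IS_ERR_OR_NULL(desc))
		handle_irq_desc(desc); /* ends up in desc->handle_irq() */
}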
struct irq_desc {
struct irq_common_data irq_common_data;
struct irq_data irq_data;
unsigned int __percpu *kstat_irqs;
irq_flow_handler_t handle_irq;
struct irqaction *action; /* IRQ action list */
struct cpumask *percpu_enabled;
wait_queue_head_t wait_for_threads;
};
struct irqaction {
irq_handler_t handler; /* typedef irqreturn_t (*irq_handler_t)(int, void *) */
void *dev_id;
void __percpu *percpu_dev_id;
struct irqaction *next;
irq_handler_t thread_fn;
struct task_struct *thread;
struct irqaction *secondary;
unsigned int irq;
unsigned int flags;
unsigned long thread_flags;
unsigned long thread_mask;
const char *name;
struct proc_dir_entry *dir;
};
void __init init_IRQ(void)
{
init_irq_stacks();
init_irq_scs();
irqchip_init() {
of_irq_init(__irqchip_of_table/* matches */) {
const struct of_device_id *match;
struct device_node *np, *parent = NULL;
struct of_intc_desc *desc, *temp_desc;
struct list_head intc_desc_list, intc_parent_list;
INIT_LIST_HEAD(&intc_desc_list);
INIT_LIST_HEAD(&intc_parent_list);
for_each_matching_node_and_match(np, matches, &match) {
if (!of_property_read_bool(np, "interrupt-controller") ||
!of_device_is_available(np))
continue;
if (WARN(!match->data, "of_irq_init: no init function for %s\n",
match->compatible))
continue;
/*
* Here, we allocate and populate an of_intc_desc with the node
* pointer, interrupt-parent device_node etc.
*/
desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc) {
of_node_put(np);
goto err;
}
desc->irq_init_cb = match->data;
desc->dev = of_node_get(np);
/*
* interrupts-extended can reference multiple parent domains.
* Arbitrarily pick the first one; assume any other parents
* are the same distance away from the root irq controller.
*/
desc->interrupt_parent = of_parse_phandle(np, "interrupts-extended", 0);
if (!desc->interrupt_parent)
desc->interrupt_parent = of_irq_find_parent(np);
if (desc->interrupt_parent == np) {
of_node_put(desc->interrupt_parent);
desc->interrupt_parent = NULL;
}
list_add_tail(&desc->list, &intc_desc_list);
}
/*
* The root irq controller is the one without an interrupt-parent.
* That one goes first, followed by the controllers that reference it,
* followed by the ones that reference the 2nd level controllers, etc.
*/
while (!list_empty(&intc_desc_list)) {
/*
* Process all controllers with the current 'parent'.
* First pass will be looking for NULL as the parent.
* The assumption is that NULL parent means a root controller.
*/
list_for_each_entry_safe(desc, temp_desc, &intc_desc_list, list) {
int ret;
if (desc->interrupt_parent != parent)
continue;
list_del(&desc->list);
of_node_set_flag(desc->dev, OF_POPULATED);
ret = desc->irq_init_cb(desc->dev, desc->interrupt_parent) {
gic_of_init(); /* for gic_v3 */
}
if (ret) {
pr_err("%s: Failed to init %pOF (%p), parent %p\n",
__func__, desc->dev, desc->dev,
desc->interrupt_parent);
of_node_clear_flag(desc->dev, OF_POPULATED);
kfree(desc);
continue;
}
/*
* This one is now set up; add it to the parent list so
* its children can get processed in a subsequent pass.
*/
list_add_tail(&desc->list, &intc_parent_list);
}
/* Get the next pending parent that might have children */
desc = list_first_entry_or_null(&intc_parent_list,
typeof(*desc), list);
if (!desc) {
pr_err("of_irq_init: children remain, but no parents\n");
break;
}
list_del(&desc->list);
parent = desc->dev;
kfree(desc);
}
list_for_each_entry_safe(desc, temp_desc, &intc_parent_list, list) {
list_del(&desc->list);
kfree(desc);
}
err:
list_for_each_entry_safe(desc, temp_desc, &intc_desc_list, list) {
list_del(&desc->list);
of_node_put(desc->dev);
kfree(desc);
}
}
acpi_probe_device_table(irqchip);
#define acpi_probe_device_table(t) \
({ \
extern struct acpi_probe_entry ACPI_PROBE_TABLE(t), \
ACPI_PROBE_TABLE_END(t); \
__acpi_probe_device_table(&ACPI_PROBE_TABLE(t), \
(&ACPI_PROBE_TABLE_END(t) - \
&ACPI_PROBE_TABLE(t))); \
})
}
if (system_uses_irq_prio_masking()) {
/*
* Now that we have a stack for our IRQ handler, set
* the PMR/PSR pair to a consistent state.
*/
WARN_ON(read_sysreg(daif) & PSR_A_BIT);
local_daif_restore(DAIF_PROCCTX_NOIRQ);
}
}
GICv3 Main Components:
- Distributor (GICD)
Primary Functions:
- Manages Shared Peripheral Interrupts (SPIs)
- Controls interrupt routing to Redistributors
- Handles interrupt prioritization
- Manages interrupt enable/disable
Key Operations:
- Group assignment (G0/G1S/G1NS)
- Priority assignment
- Target processor selection
- Interrupt enabling/disabling
- Security state control
- Redistributor (GICR) - one per PE
Primary Functions:
- One per Processing Element (PE)
- Manages SGIs and PPIs
- Controls power management
- Handles LPI configuration
Key Features:
- Local interrupt routing
- Wake-up logic
- PE interface management
- LPI configuration tables
- PPIs/SGIs configuration
- CPU Interface (GICR_SGI/GICR_PPI)
Primary Functions:
- Presents interrupts to PE
- Handles interrupt acknowledgment
- Manages EOI (End Of Interrupt)
- Controls priority masking
Key Registers:
- ICC_IAR: Interrupt Acknowledge
- ICC_EOIR: End of Interrupt
- ICC_PMR: Priority Mask
- ICC_BPR: Binary Point
- ICC_RPR: Running Priority
- ITS (Interrupt Translation Service)
Primary Functions:
- MSI/MSI-X interrupt translation
- Device ID management
- Event ID to LPI mapping
- Collection management
Key Operations:
- MAPD: Device mapping
- MAPC: Collection mapping
- MAPTI: Translation mapping
- MOVI: Interrupt movement
- DISCARD: Entry removal
Interrupt Types:
- SGI (Software Generated Interrupts): 0-15
- PPI (Private Peripheral Interrupts): 16-31
- SPI (Shared Peripheral Interrupts): 32-1019
- LPI (Locality-specific Peripheral Interrupts): 8192+
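As a hedged illustration of those INTID ranges (the helper name is made up; the 1020-1023 special INTIDs and extended ranges are ignored), classifying an interrupt is a simple range check:
/* Illustration only: classify a GICv3 INTID using the ranges listed above. */
static const char *demo_gic_intid_type(u32 intid)
{
	if (intid < 16)
		return "SGI";   /* 0-15 */
	if (intid < 32)
		return "PPI";   /* 16-31 */
	if (intid <= 1019)
		return "SPI";   /* 32-1019 */
	if (intid >= 8192)
		return "LPI";   /* 8192+ */
	return "special";       /* 1020-1023 and other reserved ranges */
}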
IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);
#define IRQCHIP_DECLARE(name, compat, fn) \
OF_DECLARE_2(irqchip, name, compat, typecheck_irq_init_cb(fn))
#define OF_DECLARE_2(table, name, compat, fn) \
_OF_DECLARE(table, name, compat, fn, of_init_fn_2)
#define _OF_DECLARE(table, name, compat, fn, fn_type) \
static const struct of_device_id __of_table_##name \
__used __section("__" #table "_of_table") \
__aligned(__alignof__(struct of_device_id)) \
= { .compatible = compat, \
.data = (fn == (fn_type)NULL) ? fn : fn }
int __init
gic_of_init(struct device_node *node, struct device_node *parent)
{
struct gic_chip_data *gic;
int irq, ret;
gic = &gic_data[gic_cnt];
ret = gic_of_setup(gic, node);
if (gic_cnt == 0 && !gic_check_eoimode(node, &gic->raw_cpu_base))
static_branch_disable(&supports_deactivate_key);
ret = __gic_init_bases(gic, &node->fwnode) {
if (gic == &gic_data[0]) {
for (i = 0; i < NR_GIC_CPU_IF; i++)
gic_cpu_map[i] = 0xff;
set_handle_irq(gic_handle_irq); /* called at do_interrupt_handler */
}
ret = gic_init_bases(gic, handle);
if (gic == &gic_data[0])
gic_smp_init();
return ret;
}
if (!gic_cnt) {
gic_init_physaddr(node);
gic_of_setup_kvm_info(node);
}
if (parent) {
irq = irq_of_parse_and_map(node, 0);
gic_cascade_irq(gic_cnt, irq);
}
if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
gicv2m_init(&node->fwnode, gic_data[gic_cnt].domain);
gic_cnt++;
return 0;
}
void __init gic_smp_init(void)
{
struct irq_fwspec sgi_fwspec = {
.fwnode = gic_data.fwnode,
.param_count = 1,
};
int base_sgi;
cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
"irqchip/arm/gicv3:checkrdist",
gic_check_rdist, NULL);
cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_GIC_STARTING,
"irqchip/arm/gicv3:starting",
gic_starting_cpu, NULL);
/* Register all 8 non-secure SGIs */
base_sgi = irq_domain_alloc_irqs(gic_data.domain, 8, NUMA_NO_NODE, &sgi_fwspec);
if (WARN_ON(base_sgi <= 0))
return;
set_smp_ipi_range(base_sgi, 8);
}
set_smp_ipi_range(int ipi_base, int n) {
int i;
nr_ipi = min(n, MAX_IPI);
for (i = 0; i < nr_ipi; i++) {
int err;
err = request_percpu_irq(ipi_base + i, ipi_handler,
"IPI", &irq_stat);
WARN_ON(err);
ipi_desc[i] = irq_to_desc(ipi_base + i);
irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
}
ipi_irq_base = ipi_base;
/* Setup the boot CPU immediately */
ipi_setup(smp_processor_id());
}
gic_of_init() {
gic_init_bases() {
set_handle_irq(gic_handle_irq);
}
}
void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq;
void (*handle_arch_fiq)(struct pt_regs *) __ro_after_init = default_handle_fiq;
int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
{
if (handle_arch_irq != default_handle_irq)
return -EBUSY;
handle_arch_irq = handle_irq;
pr_info("Root IRQ handler: %ps\n", handle_irq);
return 0;
}
/* el0_irq -> irq_handler */
static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs)
{
if (unlikely(gic_supports_nmi() && !interrupts_enabled(regs))) {
__gic_handle_irq_from_irqsoff(regs);
} else {
__gic_handle_irq_from_irqson(regs) {
bool is_nmi;
u32 irqnr;
irqnr = gic_read_iar();
is_nmi = gic_rpr_is_nmi_prio();
if (is_nmi) {
nmi_enter();
__gic_handle_nmi(irqnr, regs);
nmi_exit();
}
if (gic_prio_masking_enabled()) {
gic_pmr_mask_irqs();
gic_arch_enable_irqs();
}
if (!is_nmi) {
__gic_handle_irq(irqnr, regs) {
if (gic_irqnr_is_special(irqnr))
return;
gic_complete_ack(irqnr) {
if (static_branch_likely(&supports_deactivate_key)) {
write_gicreg(irqnr, ICC_EOIR1_EL1);
}
isb();
}
ret = generic_handle_domain_irq(gic_data.domain, irqnr);
if (ret) {
WARN_ONCE(true, "Unexpected interrupt (irqnr %u)\n", irqnr);
gic_deactivate_unhandled(irqnr);
}
}
}
}
}
}
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
struct irq_desc *desc = irq_resolve_mapping(domain, hwirq, NULL/*irq*/);
return handle_irq_desc(desc) {
generic_handle_irq_desc(desc) {
desc->handle_irq(desc) {
irqreturn_t retval = IRQ_NONE;
unsigned int irq = desc->irq_data.irq;
struct irqaction *action;
record_irq_time(desc);
for_each_action_of_desc(desc, action) {
irqreturn_t res;
/* handler points to the real handler for a non-threaded handler
 * and returns IRQ_WAKE_THREAD for a threaded handler */
res = action->handler(irq, action->dev_id);
switch (res) {
case IRQ_WAKE_THREAD:
__irq_wake_thread(desc, action);
case IRQ_HANDLED:
*flags |= action->flags;
break;
default:
break;
}
retval |= res;
}
return retval;
}
}
}
}
static int logibm_open(struct input_dev *dev)
{
if (request_irq(logibm_irq, logibm_interrupt, 0, "logibm", NULL)) {
return -EBUSY;
}
outb(LOGIBM_ENABLE_IRQ, LOGIBM_CONTROL_PORT);
return 0;
}
static irqreturn_t logibm_interrupt(int irq, void *dev_id)
{
	/* read mouse movement/buttons from the I/O ports, report input events */
	return IRQ_HANDLED;
}
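logibm uses a plain hard-irq handler. For the threaded case that request_threaded_irq()/__setup_irq() below implement, here is a hedged sketch with hypothetical names: the primary handler quiesces the device and returns IRQ_WAKE_THREAD, and the irq thread created by setup_irq_thread() then runs thread_fn in sleepable context.
/* Hypothetical device, illustration only. */
static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* ack/mask the device so the line stops asserting, then defer */
	return IRQ_WAKE_THREAD; /* makes handle_irq_event() wake the irq thread */
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/* may sleep: e.g. read the device over I2C/SPI and push the data on */
	return IRQ_HANDLED;
}

static int demo_probe(unsigned int demo_irq, void *demo_dev)
{
	return request_threaded_irq(demo_irq, demo_hardirq, demo_thread_fn,
			IRQF_ONESHOT, "demo", demo_dev);
}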
typedef irqreturn_t (*irq_handler_t)(int irq, void *dev_id);
enum irqreturn {
IRQ_NONE = (0 << 0),
IRQ_HANDLED = (1 << 0),
IRQ_WAKE_THREAD = (1 << 1),
};
static inline int request_irq(
unsigned int irq, irq_handler_t handler,
unsigned long flags, const char *name, void *dev)
{
return request_threaded_irq(irq, handler, NULL, flags, name, dev);
}
int request_threaded_irq(
unsigned int irq, irq_handler_t handler,
irq_handler_t thread_fn, unsigned long irqflags,
const char *devname, void *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
int retval;
desc = irq_to_desc(irq);
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
action->handler = handler;
action->thread_fn = thread_fn;
action->flags = irqflags;
action->name = devname;
action->dev_id = dev_id;
retval = __setup_irq(irq, desc, action);
return retval;
}
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
struct irqaction *old, **old_ptr;
unsigned long flags, thread_mask = 0;
int ret, nested, shared = 0;
new->irq = irq;
if (new->thread_fn && !nested) {
ret = setup_irq_thread(new, irq, false);
}
old_ptr = &desc->action;
old = *old_ptr;
/* add new interrupt at end of irq queue */
if (old) {
do {
thread_mask |= old->thread_mask;
old_ptr = &old->next;
old = *old_ptr;
} while (old);
}
*old_ptr = new;
if (new->thread)
wake_up_process(new->thread);
}
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_NORMAL, 0);
}
static int setup_irq_thread(
struct irqaction *new, unsigned int irq, bool secondary)
{
struct task_struct *t;
struct sched_param param = {
.sched_priority = MAX_USER_RT_PRIO/2,
};
t = kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name);
sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
get_task_struct(t);
new->thread = t;
return 0;
}
request_percpu_irq(unsigned int irq, irq_handler_t handler,
const char *devname, void __percpu *percpu_dev_id) {
return __request_percpu_irq(irq, handler, 0, devname, percpu_dev_id) {
desc = irq_to_desc(irq);
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
action->handler = handler;
action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
action->name = devname;
action->percpu_dev_id = dev_id;
retval = irq_chip_pm_get(&desc->irq_data);
retval = __setup_irq(irq, desc, action /*new*/) {
new->irq = irq;
nested = irq_settings_is_nested_thread(desc);
if (nested) {
if (!new->thread_fn) {
ret = -EINVAL;
goto out_mput;
}
new->handler = irq_nested_primary_handler;
} else {
if (irq_settings_can_thread(desc)) {
ret = irq_setup_forced_threading(new);
if (ret)
goto out_mput;
}
}
/* Create a handler thread when a thread function is supplied
* and the interrupt does not nest into another interrupt thread. */
if (new->thread_fn && !nested) {
ret = setup_irq_thread(new, irq, false) {
kthread_create(irq_thread, new, "irq/%d-%s", irq, new->name);
}
if (ret)
goto out_mput;
if (new->secondary) {
ret = setup_irq_thread(new->secondary, irq, true);
if (ret)
goto out_thread;
}
}
if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
new->flags &= ~IRQF_ONESHOT;
mutex_lock(&desc->request_mutex);
chip_bus_lock(desc);
/* First installed action requests resources. */
if (!desc->action) {
ret = irq_request_resources(desc);
if (ret) {
goto out_bus_unlock;
}
}
raw_spin_lock_irqsave(&desc->lock, flags);
old_ptr = &desc->action;
old = *old_ptr;
if (old) {
unsigned int oldtype;
if (desc->istate & IRQS_NMI) {
ret = -EINVAL;
goto out_unlock;
}
if (irqd_trigger_type_was_set(&desc->irq_data)) {
oldtype = irqd_get_trigger_type(&desc->irq_data);
} else {
oldtype = new->flags & IRQF_TRIGGER_MASK;
irqd_set_trigger_type(&desc->irq_data, oldtype);
}
if (!((old->flags & new->flags) & IRQF_SHARED) ||
(oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
((old->flags ^ new->flags) & IRQF_ONESHOT))
goto mismatch;
/* All handlers must agree on per-cpuness */
if ((old->flags & IRQF_PERCPU) !=
(new->flags & IRQF_PERCPU))
goto mismatch;
/* add new interrupt at end of irq queue */
do {
thread_mask |= old->thread_mask;
old_ptr = &old->next;
old = *old_ptr;
} while (old);
shared = 1;
}
/*
* Setup the thread mask for this irqaction for ONESHOT. For
* !ONESHOT irqs the thread mask is 0 so we can avoid a
* conditional in irq_wake_thread().
*/
if (new->flags & IRQF_ONESHOT) {
/*
* Unlikely to have 32 resp 64 irqs sharing one line,
* but who knows.
*/
if (thread_mask == ~0UL) {
ret = -EBUSY;
goto out_unlock;
}
/*
* The thread_mask for the action is or'ed to
* desc->thread_active to indicate that the
* IRQF_ONESHOT thread handler has been woken, but not
* yet finished. The bit is cleared when a thread
* completes. When all threads of a shared interrupt
* line have completed desc->threads_active becomes
* zero and the interrupt line is unmasked. See
* handle.c:irq_wake_thread() for further information.
*
* If no thread is woken by primary (hard irq context)
* interrupt handlers, then desc->threads_active is
* also checked for zero to unmask the irq line in the
* affected hard irq flow handlers
* (handle_[fasteoi|level]_irq).
*
* The new action gets the first zero bit of
* thread_mask assigned. See the loop above which or's
* all existing action->thread_mask bits.
*/
new->thread_mask = 1UL << ffz(thread_mask);
} else if (new->handler == irq_default_primary_handler &&
!(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
/*
* The interrupt was requested with handler = NULL, so
* we use the default primary handler for it. But it
* does not have the oneshot flag set. In combination
* with level interrupts this is deadly, because the
* default primary handler just wakes the thread, then
* the irq line is reenabled, but the device still
* has the level irq asserted. Rinse and repeat....
*
* While this works for edge type interrupts, we play
* it safe and reject unconditionally because we can't
* say for sure which type this interrupt really
* has. The type flags are unreliable as the
* underlying chip implementation can override them.
*/
pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n",
new->name, irq);
ret = -EINVAL;
goto out_unlock;
}
if (!shared) {
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc,
new->flags & IRQF_TRIGGER_MASK);
if (ret)
goto out_unlock;
}
/*
* Activate the interrupt. That activation must happen
* independently of IRQ_NOAUTOEN. request_irq() can fail
* and the callers are supposed to handle
* that. enable_irq() of an interrupt requested with
* IRQ_NOAUTOEN is not supposed to fail. The activation
* keeps it in shutdown mode, it merely associates
* resources if necessary and if that's not possible it
* fails. Interrupts which are in managed shutdown mode
* will simply ignore that activation request.
*/
ret = irq_activate(desc);
if (ret)
goto out_unlock;
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
IRQS_ONESHOT | IRQS_WAITING);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
if (new->flags & IRQF_PERCPU) {
irqd_set(&desc->irq_data, IRQD_PER_CPU);
irq_settings_set_per_cpu(desc);
if (new->flags & IRQF_NO_DEBUG)
irq_settings_set_no_debug(desc);
}
if (noirqdebug)
irq_settings_set_no_debug(desc);
if (new->flags & IRQF_ONESHOT)
desc->istate |= IRQS_ONESHOT;
/* Exclude IRQ from balancing if requested */
if (new->flags & IRQF_NOBALANCING) {
irq_settings_set_no_balancing(desc);
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
}
if (!(new->flags & IRQF_NO_AUTOEN) &&
irq_settings_can_autoenable(desc)) {
irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
} else {
/*
* Shared interrupts do not go well with disabling
* auto enable. The sharing interrupt might request
* it while it's still disabled and then wait for
* interrupts forever.
*/
WARN_ON_ONCE(new->flags & IRQF_SHARED);
/* Undo nested disables: */
desc->depth = 1;
}
} else if (new->flags & IRQF_TRIGGER_MASK) {
unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);
if (nmsk != omsk)
/* hope the handler works with current trigger mode */
pr_warn("irq %d uses trigger mode %u; requested %u\n",
irq, omsk, nmsk);
}
*old_ptr = new;
irq_pm_install_action(desc, new);
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;
/*
* Check whether we disabled the irq via the spurious handler
* before. Reenable it and give it another chance.
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
__enable_irq(desc);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
chip_bus_sync_unlock(desc);
mutex_unlock(&desc->request_mutex);
irq_setup_timings(desc, new);
wake_up_and_wait_for_irq_thread_ready(desc, new);
wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
return 0;
}
return retval;
}
}
VS | standard kernel | PREEMPT_RT kernel |
---|---|---|
Interrupt bottom half | invokes softirq to handle the irqs | just wakes up ksoftirqd |
__local_bh_enable_ip | call do_softirq if not in irq ctx | call do_softirq if preemptible with irqs enabled, otherwise wake up ksoftirqd |
ksoftirqd prio | bh > RT > ksoftirqd == fair | bh > ksoftirqd == RT > fair |
ksoftirqd preemptible | preempted only by hard irq | fully preemptible |
mutual exclusion | by disabling bh | by disabling preemption |
softirq ctrl & preemption | disabling softirq also disables preemption | the two are decoupled |
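The "mutual exclusion by disabling bh" row corresponds to the classic pattern sketched below (hypothetical names): process context takes the lock with the _bh variant so a softirq/tasklet on the same CPU cannot interrupt the critical section and deadlock on the same lock.
/* Illustration only: data shared between process context and softirq context. */
static DEFINE_SPINLOCK(demo_lock);
static unsigned long demo_events;

static void demo_bh_fn(unsigned long data) /* runs in softirq (tasklet) context */
{
	spin_lock(&demo_lock); /* bottom halves are already disabled here */
	demo_events++;
	spin_unlock(&demo_lock);
}

static unsigned long demo_read_events(void) /* process context */
{
	unsigned long n;

	spin_lock_bh(&demo_lock); /* disables bottom halves on this CPU, then locks */
	n = demo_events;
	spin_unlock_bh(&demo_lock); /* re-enabling bh may run pending softirqs */
	return n;
}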
Softirq execution points:
- Interrupt bottom half
After the hard interrupt has been handled, irq_exit() is called on the exit path; if pending softirqs are detected, invoke_softirq() is called to process them.
irq_exit_rcu() {
	preempt_count_sub(HARDIRQ_OFFSET);
	if (!in_interrupt() && local_softirq_pending())
		invoke_softirq();
	tick_irq_exit();
}
#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT kernel just wakes up ksoftirqd */
static inline void invoke_softirq(void)
{
	if (should_wake_ksoftirqd() { return !this_cpu_read(softirq_ctrl.cnt); })
		wakeup_softirqd();
}
#else
/* standard kernel invokes softirq to handle the irqs */
static inline void invoke_softirq(void)
{
	if (!force_irqthreads() || !__this_cpu_read(ksoftirqd))
		handle_softirqs();
	else
		wakeup_softirqd();
}
#endif
- If the softirqs were not fully processed at the previous execution point, ksoftirqd is woken up and handles them. The ksoftirqd thread is an ordinary thread of the fair scheduling class at the default priority.
handle_softirqs(void) {
	pending = local_softirq_pending();
	h = softirq_vec;
	while ((softirq_bit = ffs(pending))) {
		h->action(h); /* soft irq handler, e.g., net_rx_action, net_tx_action */
		h++;
		pending >>= softirq_bit;
	}
	pending = local_softirq_pending();
	if (pending) {
		if (time_before(jiffies, end) && !need_resched() && --max_restart)
			goto restart;
		wakeup_softirqd();
	}
}
- Mutual-exclusion mechanisms that disable the bottom half (such as spin_lock_bh()/spin_unlock_bh()) call __local_bh_enable_ip() when the outermost bottom-half section is re-enabled; if not in interrupt context, it calls do_softirq() to process pending softirqs.
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) {
	bool preempt_on = preemptible();
	pending = local_softirq_pending();
	if (!preempt_on) {
		wakeup_softirqd();
		goto out;
	}
	cnt = SOFTIRQ_OFFSET;
	__local_bh_enable(cnt, false);
	handle_softirqs();
}
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
};
static __init int spawn_ksoftirqd(void)
{
cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, takeover_tasklets);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
}
early_initcall(spawn_ksoftirqd);
/* kernel/smpboot.c */
static LIST_HEAD(hotplug_threads);
static DEFINE_MUTEX(smpboot_threads_lock);
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
unsigned int cpu;
int ret = 0;
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
goto out;
}
smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
return ret;
}
static int
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
struct smpboot_thread_data *td;
if (tsk)
return 0;
td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
if (!td)
return -ENOMEM;
td->cpu = cpu;
td->ht = ht;
tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, ht->thread_comm);
if (IS_ERR(tsk)) {
kfree(td);
return PTR_ERR(tsk);
}
/* Park the thread so that it could start right on the CPU
* when it is available. */
kthread_park(tsk);
get_task_struct(tsk);
*per_cpu_ptr(ht->store, cpu) = tsk;
if (ht->create) {
/* Make sure that the task has actually scheduled out
* into park position, before calling the create
* callback. At least the migration thread callback
* requires that the task is off the runqueue. */
if (!wait_task_inactive(tsk, TASK_PARKED))
WARN_ON(1);
else
ht->create(cpu);
}
return 0;
}
struct task_struct *kthread_create_on_cpu(
int (*threadfn)(void *data),
void *data, unsigned int cpu,
const char *namefmt)
{
struct task_struct *p;
p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu);
if (IS_ERR(p))
return p;
kthread_bind(p, cpu);
/* CPU hotplug need to bind once again when unparking the thread. */
set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
to_kthread(p)->cpu = cpu;
return p;
}
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
{
unsigned long flags;
if (!wait_task_inactive(p, state)) {
WARN_ON(1);
return;
}
/* It's safe because the task is inactive. */
raw_spin_lock_irqsave(&p->pi_lock, flags);
do_set_cpus_allowed(p, mask);
p->flags |= PF_NO_SETAFFINITY;
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
struct rq *rq = task_rq(p);
bool queued, running;
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued) {
/* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock. */
lockdep_assert_held(&rq->lock);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, new_mask);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
}
const struct sched_class fair_sched_class = {
#ifdef CONFIG_SMP
.set_cpus_allowed = set_cpus_allowed_common,
#endif
};
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
cpumask_copy(&p->cpus_allowed, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
int smpboot_thread_fn(void *data)
{
struct smpboot_thread_data *td = data;
struct smp_hotplug_thread *ht = td->ht;
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
preempt_disable();
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
/* cleanup must mirror setup */
if (ht->cleanup && td->status != HP_THREAD_NONE)
ht->cleanup(td->cpu, cpu_online(td->cpu));
kfree(td);
return 0;
}
if (kthread_should_park()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->park && td->status == HP_THREAD_ACTIVE) {
BUG_ON(td->cpu != smp_processor_id());
ht->park(td->cpu);
td->status = HP_THREAD_PARKED;
}
kthread_parkme();
/* We might have been woken for stop */
continue;
}
BUG_ON(td->cpu != smp_processor_id());
/* Check for state change setup */
switch (td->status) {
case HP_THREAD_NONE:
__set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->setup)
ht->setup(td->cpu);
td->status = HP_THREAD_ACTIVE;
continue;
case HP_THREAD_PARKED:
__set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->unpark)
ht->unpark(td->cpu);
td->status = HP_THREAD_ACTIVE;
continue;
}
if (!ht->thread_should_run(td->cpu)) {
preempt_enable_no_resched();
schedule();
} else {
__set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu); /* run_ksoftirqd */
}
}
}
static void run_ksoftirqd(unsigned int cpu)
{
ksoftirqd_run_begin() {
#ifdef CONFIG_PREEMPT_RT
	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
	local_irq_disable();
#else
	local_irq_disable();
#endif
}
if (local_softirq_pending()) {
handle_softirqs(true);
ksoftirqd_run_end();
cond_resched();
return;
}
ksoftirqd_run_end();
}
struct softirq_action
{
void (*action)(struct softirq_action *);
};
void handle_softirqs(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
int max_restart = MAX_SOFTIRQ_RESTART;
struct softirq_action *h;
bool in_hardirq;
__u32 pending;
int softirq_bit;
/* Mask out PF_MEMALLOC as the current task context is borrowed for the
 * softirq. A softirq handler such as network RX might set PF_MEMALLOC
 * again if the socket is related to swap */
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
account_irq_enter_time(current);
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
restart:
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
local_irq_enable();
h = softirq_vec;
while ((softirq_bit = ffs(pending))) {
unsigned int vec_nr;
int prev_count;
h += softirq_bit - 1;
vec_nr = h - softirq_vec;
prev_count = preempt_count();
kstat_incr_softirqs_this_cpu(vec_nr);
h->action(h); /* soft irq handler, e.g., net_rx_action, net_tx_action */
h++;
pending >>= softirq_bit;
}
rcu_bh_qs();
local_irq_disable();
pending = local_softirq_pending();
if (pending) {
if (time_before(jiffies, end) && !need_resched() && --max_restart)
goto restart;
wakeup_softirqd();
}
lockdep_softirq_end(in_hardirq);
account_irq_exit_time(current);
__local_bh_enable(SOFTIRQ_OFFSET);
WARN_ON_ONCE(in_interrupt());
current_restore_flags(old_flags, PF_MEMALLOC);
}
void __init softirq_init(void)
{
int cpu;
for_each_possible_cpu(cpu) {
per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
}
open_softirq(TASKLET_SOFTIRQ, tasklet_action);
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
IRQ_POLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ,
NR_SOFTIRQS
};
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
}
To summarize, each softirq goes through the following stages:
- Registration of a softirq with the open_softirq function.
- Activation of a softirq by marking it as deferred with the raise_softirq function.
- After this, all marked softirqs will be triggered the next time the Linux kernel schedules a round of execution of deferrable functions.
- Execution of the deferred functions of that type.
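A hedged sketch of those stages end to end. DEMO_SOFTIRQ is a made-up slot; in practice the NR_SOFTIRQS set is fixed and drivers use tasklets instead, so treat this purely as an illustration of the open_softirq()/raise_softirq() API:
/* Illustration only: DEMO_SOFTIRQ is hypothetical. */
static void demo_softirq_action(struct softirq_action *h)
{
	/* deferred work runs here, in softirq context with irqs enabled */
}

static void demo_softirq_setup(void)
{
	open_softirq(DEMO_SOFTIRQ, demo_softirq_action); /* stage 1: register */
}

static void demo_softirq_kick(void) /* e.g. from a hard-irq handler */
{
	raise_softirq(DEMO_SOFTIRQ); /* stage 2: set the per-CPU pending bit */
	/* stages 3-4: the bit is consumed later by handle_softirqs(), either on
	 * irq_exit(), in ksoftirqd, or when the outermost bh section is re-enabled */
}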
void raise_softirq_irqoff(unsigned int nr)
{
__raise_softirq_irqoff(nr) {
or_softirq_pending(1UL << nr);
}
if (!in_interrupt())
wakeup_softirqd();
}
static __always_inline int preempt_count(void)
{
return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
}
/* 1. per cpu data, non-zero means preemption is disabled
0-7 Preemption counter (max value = 255)
8-15 Softirq counter (max value = 255); indicates whether softirq processing is active,
while __softirq_pending shows which specific softirqs are pending
16-27 Hardirq counter (max value = 4096)
28 PREEMPT_ACTIVE flag */
DECLARE_PER_CPU(int, __preempt_count);
/* 2. per cpu bitmap that indicates which soft IRQs are pending for the CPU. */
typedef struct {
unsigned int __softirq_pending;
} ____cacheline_aligned irq_cpustat_t;
DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
/* 3. per task data */
struct task_struct {
struct thread_info {
unsigned long flags; /* TIF_SIGPENDING, TIF_NEED_RESCHED */
u64 ttbr0;
union {
u64 preempt_count; /* 0 => preemptible, <0 => bug */
struct {
u32 count;
u32 need_resched;
} preempt;
};
u32 cpu;
} thread_info;
};
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK))
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
#define in_nmi() (preempt_count() & NMI_MASK)
#define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
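A hedged example of how code uses these context macros (illustrative helper only; most real call sites know their context statically): pick an allocation mode that cannot sleep when running in interrupt context.
/* Illustration only: choose a non-sleeping allocation in atomic context. */
static void *demo_alloc(size_t size)
{
	gfp_t gfp = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;

	return kmalloc(size, gfp);
}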
void __raise_softirq_irqoff(unsigned int nr)
{
or_softirq_pending(1UL << nr);
}
#define local_softirq_pending_ref irq_stat.__softirq_pending
#define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref))
#define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x)))
#define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x)))
void wakeup_softirqd(void)
{
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
if (tsk && tsk->state != TASK_RUNNING)
wake_up_process(tsk);
}
static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
struct tasklet_head {
struct tasklet_struct *head;
struct tasklet_struct **tail;
};
struct tasklet_struct
{
struct tasklet_struct *next;
unsigned long state;
atomic_t count; /* 0 enabled, non-0 disabled */
void (*func)(unsigned long);
unsigned long data;
};
static __latent_entropy void tasklet_action(struct softirq_action *a)
{
tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
}
static void tasklet_action_common(struct softirq_action *a,
struct tasklet_head *tl_head,
unsigned int softirq_nr)
{
struct tasklet_struct *list;
local_irq_disable();
list = tl_head->head;
tl_head->head = NULL;
tl_head->tail = &tl_head->head;
local_irq_enable();
while (list) {
struct tasklet_struct *t = list;
list = list->next;
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
t->func(t->data);
tasklet_unlock(t);
continue;
}
tasklet_unlock(t);
}
local_irq_disable();
t->next = NULL;
*tl_head->tail = t;
tl_head->tail = &t->next;
__raise_softirq_irqoff(softirq_nr);
local_irq_enable();
}
}
void tasklet_init(struct tasklet_struct *t,
void (*func)(unsigned long), unsigned long data)
{
t->next = NULL;
t->state = 0;
atomic_set(&t->count, 0);
t->func = func;
t->data = data;
}
enum {
TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */
TASKLET_STATE_RUN /* Tasklet is running (SMP only) */
};
void tasklet_schedule(struct tasklet_struct *t)
{
if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
__tasklet_schedule(t);
}
void __tasklet_schedule(struct tasklet_struct *t)
{
__tasklet_schedule_common(t, &tasklet_vec, TASKLET_SOFTIRQ);
}
void __tasklet_schedule_common(struct tasklet_struct *t,
struct tasklet_head __percpu *headp,
unsigned int softirq_nr)
{
struct tasklet_head *head;
unsigned long flags;
local_irq_save(flags);
head = this_cpu_ptr(headp);
t->next = NULL;
*head->tail = t;
head->tail = &(t->next);
raise_softirq_irqoff(softirq_nr);
local_irq_restore(flags);
}
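Putting the tasklet API together, a hedged usage sketch with hypothetical driver names: the hard-irq handler defers its work by scheduling the tasklet, and tasklet_action_common() above later runs func on the same CPU in TASKLET_SOFTIRQ context.
/* Illustration only: top half schedules, the tasklet does the deferred work. */
static void demo_tasklet_fn(unsigned long data)
{
	/* runs later in softirq context; must not sleep */
}

static struct tasklet_struct demo_tasklet;

static irqreturn_t demo_isr(int irq, void *dev_id)
{
	/* ack the device, then defer the heavy lifting */
	tasklet_schedule(&demo_tasklet); /* sets TASKLET_STATE_SCHED, raises TASKLET_SOFTIRQ */
	return IRQ_HANDLED;
}

static void demo_driver_init(void)
{
	tasklet_init(&demo_tasklet, demo_tasklet_fn, 0);
}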