The "Holy Bible" for embedded engineers
Integrating assembly code with C for low-level hardware control and optimization
Reach for inline/standalone assembly when you need exact instructions, special registers, or calling conventions C cannot provide. Keep interfaces small, stable, and documented.
// C wrapper with tiny asm core (example, ARM)
static inline uint32_t rbit32(uint32_t v) {
uint32_t out;
__asm volatile ("rbit %0, %1" : "=r"(out) : "r"(v));
return out;
}
Related: prefer the RBIT intrinsic over raw asm where available, and consider whether a separate .S file is preferable to inline asm. See Embedded_C/Compiler_Intrinsics.md and Embedded_C/Type_Qualifiers.md (for volatile interactions).
Assembly integration is the process of combining assembly language code with high-level C code to achieve low-level hardware control, performance optimization, and access to specific CPU features that may not be available through standard C constructs. It is essential in embedded systems for:
Low-level Control: direct access to CPU registers, special instructions, and peripheral hardware that C cannot name (see the sketch after this list).
Performance Optimization: hand-picked instruction sequences for hot paths where the compiler's output is not good enough.
Hardware Abstraction: small, well-documented assembly cores wrapped behind stable C interfaces.
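As a concrete illustration of low-level control, here is a minimal sketch, assuming an ARMv6-M/ARMv7-M target and GCC/Clang extended asm (CMSIS exposes the same operation as the __get_PRIMASK() intrinsic): standard C has no way to name the PRIMASK special register, but a single MRS instruction can read it.
// Read the PRIMASK special register - not expressible in standard C
static inline uint32_t read_primask(void) {
    uint32_t primask;
    __asm volatile ("mrs %0, primask" : "=r" (primask));
    return primask;   // bit 0 set => interrupts are masked at the core
}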
C Code (High-level):
// High-level C code - compiler generates assembly
uint32_t add_numbers(uint32_t a, uint32_t b) {
return a + b;
}
// Compiler-generated assembly (simplified):
// add r0, r0, r1
// bx lr
Assembly Code (Low-level):
// Direct assembly control
uint32_t add_numbers_asm(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"add %0, %1, %2\n"
: "=r" (result)
: "r" (a), "r" (b)
);
return result;
}
Mixed Approach:
// C function with assembly for critical sections
void process_data(uint32_t* data, size_t size) {
// C code for setup
for (size_t i = 0; i < size; i++) {
// Assembly for performance-critical operation
__asm volatile (
"ldr r0, [%0]\n"
"add r0, r0, #1\n"
"str r0, [%0]\n"
: : "r" (&data[i]) : "r0"
);
}
}
Performance Critical Applications: tight DSP/filter loops, cryptographic kernels, interrupt handlers with hard latency budgets.
Hardware-Specific Operations: special instructions, system and co-processor registers, interrupt masking, barriers.
Optimization Requirements: cycle-accurate timing, code-size limits, or guarantees the optimizer cannot provide.
Performance Comparison:
// C implementation - compiler optimized
uint32_t multiply_by_16_c(uint32_t value) {
// Modern compilers typically strength-reduce this to a shift automatically.
return value * 16;
}
// Assembly implementation - hand-optimized
uint32_t multiply_by_16_asm(uint32_t value) {
uint32_t result;
__asm volatile (
"lsl %0, %1, #4\n" // Logical shift left by 4 (multiply by 16)
: "=r" (result)
: "r" (value)
);
return result;
}
// Note: Compilers usually generate a shift for multiply-by-constant; hand-written
// asm is rarely faster for simple cases and may hinder optimization and portability.
Hardware Access:
// Direct hardware register access
// Guard ARM-specific inline assembly to avoid build errors on other targets
#if defined(__arm__) || defined(__aarch64__)
void enable_interrupts_asm(void) {
__asm volatile (
"cpsie i\n"
: : : "memory"
);
}
void disable_interrupts_asm(void) {
__asm volatile (
"cpsid i\n"
: : : "memory"
);
}
// Memory barrier for multi-core systems
void memory_barrier_asm(void) {
__asm volatile (
"dmb 0xF\n"
: : : "memory"
);
}
#endif
Interrupt Handling:
// Example interrupt service routine attribute is compiler/target-specific
extern volatile uint32_t status_register; // hypothetical memory-mapped status register for this example
void __attribute__((interrupt)) fast_isr(void) {
// Assembly for fast interrupt handling
__asm volatile (
"ldr r0, [%0]\n" // Load status register
"orr r0, r0, #1\n" // Set flag
"str r0, [%0]\n" // Store back
: : "r" (&status_register) : "r0", "memory"
);
}
High Impact Scenarios: tight inner loops, SIMD/DSP kernels, interrupt entry/exit, cycle-accurate bit-banging.
Low Impact Scenarios: ordinary control flow, I/O-bound code, anything the optimizer already handles well.
Inline Assembly Process: the compiler splices your instructions into its output and allocates registers around the constraints you declare.
Calling Conventions: the ABI rules for how parameters, return values, and preserved registers move between C and assembly.
Register Allocation: "r" constraints let the compiler pick registers; any register you name directly must appear in the clobber list.
Inline Assembly: asm blocks embedded in C functions, convenient for short sequences.
Separate Assembly Files: .S files assembled and linked with the C code, better for larger routines and full ABI control.
Mixed Approach: C for structure and setup, assembly only for the few instructions that need it.
Architecture-specific Code: guard assembly with preprocessor checks so other targets still build.
Compiler Support: GCC and Clang share the extended-asm syntax shown here; prefer an intrinsic when one exists (see the sketch after this list).
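Tying the last two points together, a minimal sketch (function names are illustrative; the asm variant assumes an ARMv5 or later core): when the compiler already provides a builtin such as __builtin_clz, it is usually preferable to hand-written asm, which stays behind an architecture guard.
// Portable: GCC/Clang builtin, works on every target and can be constant-folded
static inline uint32_t leading_zeros_portable(uint32_t v) {
    return (v != 0u) ? (uint32_t)__builtin_clz(v) : 32u;   // __builtin_clz(0) is undefined
}
#if defined(__arm__)
// ARM-only: the CLZ instruction does the same thing directly (and handles 0)
static inline uint32_t leading_zeros_asm(uint32_t v) {
    uint32_t n;
    __asm volatile ("clz %0, %1" : "=r" (n) : "r" (v));
    return n;
}
#endif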
Inline assembly allows you to embed assembly language code directly within C functions. It provides a way to write performance-critical or hardware-specific code while maintaining the benefits of C programming.
Syntax and Structure: __asm volatile ( "instruction template" : outputs : inputs : clobbers ); — empty sections may be left blank, but the colons stay.
Operand Binding: %0, %1, ... refer to the operands in declaration order; each is bound through a constraint string such as "=r", "r", or "m".
// Basic inline assembly syntax
void simple_assembly_example(void) {
__asm volatile (
"mov r0, #42\n" // Load immediate value 42 into r0
"add r0, r0, #10\n" // Add 10 to r0
: // No output operands
: // No input operands
: "r0" // Clobbered registers
);
}
// Assembly with input/output operands
uint32_t add_with_assembly(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"add %0, %1, %2\n" // Add r1 and r2, store in r0
: "=r" (result) // Output operand
: "r" (a), "r" (b) // Input operands
: // No clobbered registers
);
return result;
}
// Different constraint types
void constraint_examples(void) {
uint32_t value = 42;
uint32_t result;
// Register constraint
__asm volatile (
"mov %0, %1\n"
: "=r" (result) // Output in register
: "r" (value) // Input in register
);
// Memory constraint
__asm volatile (
"ldr %0, [%1]\n" // Load from memory
: "=r" (result) // Output in register
: "m" (value) // Input in memory
);
// Immediate constraint
__asm volatile (
"add %0, %1, #10\n" // Add immediate
: "=r" (result) // Output in register
: "r" (value), "I" (10) // Input register and immediate
);
}
// Complex assembly operation
uint32_t bit_reverse_assembly(uint32_t value) {
uint32_t result;
__asm volatile (
"rbit %0, %1\n" // Reverse bits
: "=r" (result)
: "r" (value)
);
return result;
}
// Multiple instructions
void multiple_instructions(void) {
uint32_t a = 10, b = 20, c = 30;
uint32_t result;
__asm volatile (
"add %0, %1, %2\n" // Add a and b
"mul %0, %0, %3\n" // Multiply by c
: "=r" (result)
: "r" (a), "r" (b), "r" (c)
: "cc" // Condition codes clobbered
);
}
// Conditional assembly based on compile-time constants
void conditional_assembly(void) {
uint32_t result;
#ifdef ARM_CORTEX_M4
__asm volatile (
"mov %0, #1\n" // Cortex-M4 specific
: "=r" (result)
);
#else
__asm volatile (
"mov %0, #0\n" // Other architectures
: "=r" (result)
);
#endif
}
Calling conventions define how functions pass parameters, return values, and manage the stack. They ensure compatibility between C and assembly code.
Parameter Passing: under the ARM AAPCS, the first four integer/pointer arguments go in r0-r3; further arguments go on the stack.
Return Values: 32-bit results come back in r0 (64-bit results in r0/r1).
Stack Management: the stack is full-descending; callees must preserve r4-r11 and keep 8-byte alignment at public interfaces.
// ARM calling convention example
uint32_t arm_function(uint32_t a, uint32_t b, uint32_t c) {
// Parameters: r0, r1, r2
// Return value: r0
uint32_t result;
__asm volatile (
"add %0, %1, %2\n" // Add first two parameters (compiler-chosen registers)
"add %0, %0, %3\n" // Add third parameter
: "=&r" (result) // Early clobber: output written before all inputs are read
: "r" (a), "r" (b), "r" (c)
);
return result;
}
// Assembly function callable from C (naked: the compiler emits no prologue/epilogue)
__attribute__((naked)) uint32_t assembly_function(uint32_t a, uint32_t b) {
__asm volatile (
"add r0, r0, r1\n" // AAPCS: a arrives in r0, b in r1; the sum stays in r0 as the return value
"bx lr\n" // Return to caller
);
}
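For comparison, the same add written as a separate assembly file — a sketch in GNU as syntax, assuming the ARM AAPCS (arguments in r0/r1, result in r0); the file and symbol names are illustrative, and a Thumb-only core would also need .thumb/.thumb_func directives.
@ add_asm.S - standalone assembly routine linked with the C code
    .syntax unified
    .text
    .global add_asm
    .type   add_asm, %function
add_asm:
    add r0, r0, r1      @ AAPCS: a arrives in r0, b in r1; result stays in r0
    bx  lr              @ return to the C caller

// C side: only a prototype is needed; the linker resolves the symbol
extern uint32_t add_asm(uint32_t a, uint32_t b);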
// ARM register usage
void register_usage_example(void) {
uint32_t a = 1, b = 2, c = 3, d = 4;
uint32_t result;
__asm volatile (
"mov r0, %1\n" // Load a into r0
"mov r1, %2\n" // Load b into r1
"mov r2, %3\n" // Load c into r2
"mov r3, %4\n" // Load d into r3
"add r0, r0, r1\n" // Add r0 and r1
"add r0, r0, r2\n" // Add r0 and r2
"add r0, r0, r3\n" // Add r0 and r3
"mov %0, r0\n" // Store result
: "=r" (result)
: "r" (a), "r" (b), "r" (c), "r" (d)
: "r0", "r1", "r2", "r3"
);
}
ARM assembly is the assembly language for ARM processors. It provides direct access to ARM-specific instructions and features.
Instruction Set: ARM (32-bit) and Thumb/Thumb-2 encodings; Cortex-M cores execute Thumb-2 only.
Register Set: r0-r12 general purpose, r13 (sp), r14 (lr), r15 (pc), plus the program status register.
Addressing Modes: offset, pre-indexed, and post-indexed forms such as [r0, #4], [r0, #4]!, and [r0], #4 (see the sketch after the examples below).
// Basic ARM assembly instructions
void basic_arm_instructions(void) {
uint32_t result;
__asm volatile (
"mov r0, #42\n" // Move immediate
"add r0, r0, #10\n" // Add immediate
"sub r0, r0, #5\n" // Subtract immediate
"mul r0, r0, #2\n" // Multiply
"mov %0, r0\n" // Move to output
: "=r" (result)
:
: "r0"
);
}
// ARM data processing instructions
void arm_data_processing(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"add r0, %1, %2\n" // Add
"sub r1, %1, %2\n" // Subtract
"mul r2, %1, %2\n" // Multiply
"and r3, %1, %2\n" // AND
"orr r4, %1, %2\n" // OR
"eor r5, %1, %2\n" // XOR
"mov %0, r0\n" // Return sum
: "=r" (result)
: "r" (a), "r" (b)
: "r0", "r1", "r2", "r3", "r4", "r5"
);
}
// ARM memory operations
void arm_memory_operations(void) {
uint32_t data[4] = {1, 2, 3, 4};
uint32_t result;
__asm volatile (
"ldr r0, [%1]\n" // Load word
"ldr r1, [%1, #4]\n" // Load word with offset
"add r0, r0, r1\n" // Add loaded values
"str r0, [%1, #8]\n" // Store result
"mov %0, r0\n" // Return result
: "=r" (result)
: "r" (data)
: "r0", "r1", "memory"
);
}
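The three addressing modes listed above can be seen side by side in one sketch (assuming GCC/Clang extended asm; the pointer operand is read-write because the indexed forms update it):
// Offset, pre-indexed, and post-indexed loads in one asm block
uint32_t addressing_modes_example(uint32_t* p) {
    uint32_t a, b, c;
    __asm volatile (
        "ldr %0, [%3, #4]\n"    // offset:       a = p[1], pointer unchanged
        "ldr %1, [%3, #8]!\n"   // pre-indexed:  pointer += 8 first, then b = *pointer
        "ldr %2, [%3], #4\n"    // post-indexed: c = *pointer, then pointer += 4
        : "=&r" (a), "=&r" (b), "=&r" (c), "+r" (p)
        :
        : "memory"
    );
    return a + b + c;
}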
Hardware access involves directly manipulating hardware registers and controlling hardware features through assembly code.
Register Access: reading and writing memory-mapped peripheral registers at fixed addresses.
Hardware Control: masking interrupts, issuing barriers, and driving CPU state that only privileged instructions can reach.
// Hardware register access
void hardware_register_access(void) {
volatile uint32_t* const GPIO_ODR = (uint32_t*)0x40020014;
volatile uint32_t* const GPIO_IDR = (uint32_t*)0x40020010;
uint32_t input_value;
__asm volatile (
"ldr r0, [%1]\n" // Load input register
"mov %0, r0\n" // Store input value
"orr r0, r0, #0x1000\n" // Set bit 12
"str r0, [%2]\n" // Store to output register
: "=r" (input_value)
: "r" (GPIO_IDR), "r" (GPIO_ODR)
: "r0", "memory"
);
}
// Interrupt control
void enable_interrupts_asm(void) {
__asm volatile (
"cpsie i\n" // Enable interrupts
"cpsie f\n" // Enable faults
: : : "memory"
);
}
void disable_interrupts_asm(void) {
__asm volatile (
"cpsid i\n" // Disable interrupts
"cpsid f\n" // Disable faults
: : : "memory"
);
}
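A common pattern built from these primitives is a critical section that restores the previous interrupt state rather than blindly re-enabling — a minimal sketch for a Cortex-M target (CMSIS provides __get_PRIMASK()/__set_PRIMASK() equivalents):
// Save PRIMASK, mask interrupts, and later restore the saved state
static inline uint32_t enter_critical(void) {
    uint32_t primask;
    __asm volatile ("mrs %0, primask" : "=r" (primask));
    __asm volatile ("cpsid i" ::: "memory");
    return primask;
}
static inline void exit_critical(uint32_t primask) {
    __asm volatile ("msr primask, %0" :: "r" (primask) : "memory");
}
// Usage: uint32_t s = enter_critical(); ... update shared state ... exit_critical(s);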
// Memory barriers
void memory_barriers_asm(void) {
__asm volatile (
"dmb 0xF\n" // Data memory barrier
"dsb 0xF\n" // Data synchronization barrier
"isb 0xF\n" // Instruction synchronization barrier
: : : "memory"
);
}
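Why the barrier matters in practice: a sketch of a producer that publishes data and then a ready flag, so an ISR or another core can never observe the flag before the data (the variable names are illustrative).
volatile uint32_t shared_data;
volatile uint32_t shared_ready;

void publish_data(uint32_t value) {
    shared_data = value;
    __asm volatile ("dmb 0xF" ::: "memory");  // make the data store visible before the flag store
    shared_ready = 1u;
}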
Assembly performance depends on several factors including instruction selection, register usage, and memory access patterns.
Instruction Selection: prefer single instructions (MUL, RBIT, CLZ) over multi-instruction sequences where the core provides them.
Register Usage: keep hot values in registers and let the compiler allocate them; every extra clobber restricts its choices.
Memory Access: sequential, word-aligned accesses are cheapest; the loads and stores usually dominate a loop's cost.
// Optimized assembly code
uint32_t optimized_multiply(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"mul %0, %1, %2\n" // Single multiply instruction
: "=r" (result)
: "r" (a), "r" (b)
);
return result;
}
// Optimized bit manipulation
uint32_t optimized_bit_count(uint32_t value) {
uint32_t result;
__asm volatile (
"mov r0, %1\n" // Load value
"mov r1, #0\n" // Initialize counter
"1:\n" // Loop label
"cmp r0, #0\n" // Check if zero
"beq 2f\n" // Branch if zero
"sub r0, r0, #1\n" // Subtract 1
"and r0, r0, r0\n" // AND with itself
"add r1, r1, #1\n" // Increment counter
"b 1b\n" // Branch back
"2:\n" // End label
"mov %0, r1\n" // Store result
: "=r" (result)
: "r" (value)
: "r0", "r1"
);
return result;
}
// Optimized memory access
void optimized_memory_access(uint32_t* data, size_t size) {
__asm volatile (
"mov r0, %0\n" // Load data pointer
"mov r1, %1\n" // Load size
"1:\n" // Loop label
"cmp r1, #0\n" // Check if done
"beq 2f\n" // Branch if done
"ldr r2, [r0]\n" // Load data
"add r2, r2, #1\n" // Increment
"str r2, [r0]\n" // Store back
"add r0, r0, #4\n" // Next element
"sub r1, r1, #1\n" // Decrement counter
"b 1b\n" // Branch back
"2:\n" // End label
: : "r" (data), "r" (size)
: "r0", "r1", "r2", "memory"
);
}
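To judge whether hand-written assembly actually wins, measure it — a minimal sketch using the DWT cycle counter, assuming a Cortex-M3/M4/M7 device (register addresses per the ARMv7-M architecture):
#define DEMCR      (*(volatile uint32_t*)0xE000EDFCu)   // Debug Exception and Monitor Control
#define DWT_CTRL   (*(volatile uint32_t*)0xE0001000u)
#define DWT_CYCCNT (*(volatile uint32_t*)0xE0001004u)

static inline void cycle_counter_init(void) {
    DEMCR      |= (1u << 24);   // TRCENA: enable the DWT/ITM block
    DWT_CYCCNT  = 0u;
    DWT_CTRL   |= 1u;           // CYCCNTENA: start counting core clock cycles
}
static inline uint32_t cycle_counter_read(void) {
    return DWT_CYCCNT;
}
// Usage: uint32_t t0 = cycle_counter_read(); work(); uint32_t cycles = cycle_counter_read() - t0;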
Cross-platform assembly involves writing assembly code that works across different architectures and platforms while maintaining optimal performance.
Conditional Compilation: select the right assembly (or a C fallback) at build time with architecture and feature macros.
Abstraction Layers: hide the per-architecture details behind one portable API (see the sketch after the examples below).
// Architecture detection
#ifdef __arm__
#define ARCH_ARM 1
#elif defined(__x86_64__)
#define ARCH_X86_64 1
#elif defined(__i386__)
#define ARCH_X86 1
#else
#define ARCH_UNKNOWN 1
#endif
// Platform-specific assembly
void platform_specific_assembly(void) {
#ifdef ARCH_ARM
// ARM-specific assembly
__asm volatile (
"mov r0, #42\n"
: : : "r0"
);
#elif defined(ARCH_X86_64)
// x86_64-specific assembly
__asm volatile (
"mov $42, %%rax\n"
: : : "rax"
);
#else
// Fallback implementation
// Use C code or generic assembly
#endif
}
// Feature detection
#ifdef __ARM_NEON
#define HAS_NEON 1
#else
#define HAS_NEON 0
#endif
#ifdef __SSE2__
#define HAS_SSE2 1
#else
#define HAS_SSE2 0
#endif
// Feature-specific assembly
void feature_specific_assembly(void) {
#if HAS_NEON
// NEON SIMD assembly
__asm volatile (
"vadd.f32 q0, q0, q1\n"
: : : "q0", "q1"
);
#elif HAS_SSE2
// SSE2 SIMD assembly
__asm volatile (
"addps %%xmm0, %%xmm1\n"
: : : "xmm0", "xmm1"
);
#else
// Fallback implementation
#endif
}
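The abstraction-layer idea mentioned above, as a minimal sketch: one portable function, per-architecture bodies, and a compiler-provided fallback (assumes ARMv7 or later on the ARM side; __sync_synchronize() is a GCC/Clang builtin full barrier).
// Portable full memory barrier with architecture-specific implementations
static inline void full_memory_barrier(void) {
#if defined(__arm__) || defined(__aarch64__)
    __asm volatile ("dmb sy" ::: "memory");     // ARMv7/ARMv8 data memory barrier
#elif defined(__x86_64__) || defined(__i386__)
    __asm volatile ("mfence" ::: "memory");     // x86 full fence
#else
    __sync_synchronize();                       // generic compiler barrier + fence
#endif
}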
#include <stdint.h>
#include <stdbool.h>
// Platform detection
#ifdef __arm__
#define PLATFORM_ARM 1
#else
#define PLATFORM_ARM 0
#endif
// Hardware register definitions
#define GPIOA_BASE 0x40020000
#define GPIOA_ODR (GPIOA_BASE + 0x14)
#define GPIOA_IDR (GPIOA_BASE + 0x10)
// Assembly function declarations
uint32_t add_assembly(uint32_t a, uint32_t b);
void enable_interrupts_assembly(void);
void disable_interrupts_assembly(void);
uint32_t bit_count_assembly(uint32_t value);
void memory_barrier_assembly(void);
// Inline assembly functions
static inline uint32_t add_inline_assembly(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"add %0, %1, %2\n"
: "=r" (result)
: "r" (a), "r" (b)
);
return result;
}
static inline void gpio_set_pin_assembly(uint8_t pin) {
volatile uint32_t* const gpio_odr = (uint32_t*)GPIOA_ODR;
__asm volatile (
"ldr r0, [%0]\n"
"orr r0, r0, %1\n"
"str r0, [%0]\n"
: : "r" (gpio_odr), "r" (1 << pin)
: "r0", "memory"
);
}
static inline void gpio_clear_pin_assembly(uint8_t pin) {
volatile uint32_t* const gpio_odr = (uint32_t*)GPIOA_ODR;
__asm volatile (
"ldr r0, [%0]\n"
"bic r0, r0, %1\n"
"str r0, [%0]\n"
: : "r" (gpio_odr), "r" (1 << pin)
: "r0", "memory"
);
}
static inline bool gpio_read_pin_assembly(uint8_t pin) {
volatile uint32_t* const gpio_idr = (uint32_t*)GPIOA_IDR;
uint32_t result;
__asm volatile (
"ldr r0, [%1]\n"
"and r0, r0, %2\n"
"mov %0, r0\n"
: "=r" (result)
: "r" (gpio_idr), "r" (1 << pin)
: "r0"
);
return result != 0;
}
// Performance-critical assembly functions
uint32_t fast_multiply_assembly(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"mul %0, %1, %2\n"
: "=r" (result)
: "r" (a), "r" (b)
);
return result;
}
// Note: udiv requires a core with hardware divide (e.g., Cortex-M3/M4/M7); divide-by-zero behavior depends on core configuration
uint32_t fast_divide_assembly(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"udiv %0, %1, %2\n"
: "=r" (result)
: "r" (a), "r" (b)
);
return result;
}
// Interrupt control functions
void enable_interrupts_assembly(void) {
__asm volatile (
"cpsie i\n"
"cpsie f\n"
: : : "memory"
);
}
void disable_interrupts_assembly(void) {
__asm volatile (
"cpsid i\n"
"cpsid f\n"
: : : "memory"
);
}
// Memory barrier functions
void memory_barrier_assembly(void) {
__asm volatile (
"dmb 0xF\n"
"dsb 0xF\n"
"isb 0xF\n"
: : : "memory"
);
}
// Bit manipulation functions
uint32_t bit_count_assembly(uint32_t value) {
uint32_t result;
__asm volatile (
"mov r0, %1\n"
"mov r1, #0\n"
"1:\n"
"cmp r0, #0\n"
"beq 2f\n"
"sub r0, r0, #1\n"
"and r0, r0, r0\n"
"add r1, r1, #1\n"
"b 1b\n"
"2:\n"
"mov %0, r1\n"
: "=r" (result)
: "r" (value)
: "r0", "r1"
);
return result;
}
// Cross-platform assembly functions
void platform_specific_operation(void) {
#if PLATFORM_ARM // PLATFORM_ARM is always defined (0 or 1), so test its value, not its existence
__asm volatile (
"mov r0, #42\n"
"add r0, r0, #10\n"
: : : "r0"
);
#else
// Fallback implementation
// Use C code or generic assembly
#endif
}
// Main function
int main(void) {
// Test assembly functions
uint32_t result1 = add_inline_assembly(5, 3);
uint32_t result2 = fast_multiply_assembly(4, 6);
uint32_t result3 = bit_count_assembly(0x12345678);
// Test hardware access
gpio_set_pin_assembly(13);
bool button_state = gpio_read_pin_assembly(12);
gpio_clear_pin_assembly(13);
// Test interrupt control
disable_interrupts_assembly();
// Critical section
enable_interrupts_assembly();
// Test memory barriers
memory_barrier_assembly();
// Test platform-specific operations
platform_specific_operation();
return 0;
}
Problem: Wrong or unnecessary operand constraints cause incorrect or inefficient code generation.
Solution: Declare exactly what the asm reads, writes, and clobbers, and test the result.
// ❌ Bad: Clobbering a register the asm never uses
uint32_t add_wrong(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"add %0, %1, %2\n"
: "=r" (result)
: "r" (a), "r" (b)
: "r0" // Wrong: r0 not used
);
return result;
}
// ✅ Good: Correct constraints
uint32_t add_correct(uint32_t a, uint32_t b) {
uint32_t result;
__asm volatile (
"add %0, %1, %2\n"
: "=r" (result)
: "r" (a), "r" (b)
);
return result;
}
Problem: The compiler deletes or moves assembly whose results appear unused.
Solution: Mark assembly with side effects as volatile (asm without output operands is implicitly volatile, but being explicit is clearer).
// ❌ Bad: Missing volatile - if the result is never used, the compiler may delete the asm entirely
uint32_t read_status_wrong(volatile uint32_t* reg) {
uint32_t status;
__asm (
"ldr %0, [%1]\n"
: "=r" (status)
: "r" (reg)
);
return status;
}
// ✅ Good: volatile marks the asm as having side effects, so it is never removed or hoisted
uint32_t read_status_correct(volatile uint32_t* reg) {
uint32_t status;
__asm volatile (
"ldr %0, [%1]\n"
: "=r" (status)
: "r" (reg)
);
return status;
}
Problem: Hard-coding registers that the compiler may already be using for operands.
Solution: Understand the calling convention, prefer operand constraints, and list every register you write as clobbered.
// ❌ Bad: Writing fixed registers without telling the compiler
void wrong_register_usage(uint32_t a, uint32_t b) {
__asm volatile (
"mov r0, %0\n" // r0 may already hold a live value
"mov r1, %1\n" // r1 may already hold a live value
: : "r" (a), "r" (b)
// Missing clobber list: the compiler does not know r0/r1 were overwritten
);
}
// ✅ Good: Proper register usage
void correct_register_usage(uint32_t a, uint32_t b) {
__asm volatile (
"add r0, %0, %1\n"
: : "r" (a), "r" (b)
: "r0"
);
}
Problem: Assembly written for one architecture breaks the build on every other target.
Solution: Use conditional compilation and feature detection, and keep a C fallback.
// ❌ Bad: Platform-specific code
void platform_specific_wrong(void) {
__asm volatile (
"mov r0, #42\n" // ARM-specific
);
}
// ✅ Good: Platform-independent code
void platform_specific_correct(void) {
#ifdef __arm__
__asm volatile (
"mov r0, #42\n"
: : : "r0"
);
#elif defined(__x86_64__)
__asm volatile (
"mov $42, %%rax\n"
: : : "rax"
);
#else
// Fallback implementation
#endif
}
Next Steps: Explore Memory Models to understand memory layout, or dive into Advanced Memory Management for techniques to manage memory efficiently.