The "Holy Bible" for embedded engineers
Performance optimization techniques using inline functions and macros in embedded C programming
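Inline functions and macros are essential in embedded systems for three reasons: they remove function-call overhead in performance-critical code, they keep small helpers cheap when resources are constrained, and they give efficient, readable access to hardware registers.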
Inline functions and macros are code optimization techniques that eliminate function call overhead by expanding code directly at the call site. They are particularly important in embedded systems where performance and code size are critical.
Function Call Overhead:
Code Expansion:
Optimization Strategies:
Traditional Function Call:
Call Site:
; The caller pushes both arguments, transfers control, then repairs the stack
push parameter1
push parameter2
call function_name
add esp, 8 ; Clean up stack
mov result, eax ; Get return value
Function:
; Prologue: save the caller's frame pointer and establish a new frame
push ebp
mov ebp, esp
; Function body
mov eax, result
; Epilogue: restore the caller's frame pointer and return to the call site
pop ebp
ret
Inline Expansion:
Call Site:
; Function body directly inserted here
; No push/call/ret sequence: the addition runs in place on the operands
mov eax, parameter1
add eax, parameter2
mov result, eax
Performance Critical Applications:
Resource Constraints:
Hardware Interaction:
Performance Improvements:
// Baseline out-of-line helper, used for comparison with the inline variant.
// Returns a + b (unsigned arithmetic, wraps modulo 2^32).
uint32_t add_numbers(uint32_t a, uint32_t b) {
    uint32_t sum = a;
    sum += b;
    return sum;
}
// Inline function (faster)
// Returns a + b (wraps modulo 2^32).
// Declared `static inline`: a plain C99/C11 `inline` definition emits no
// out-of-line symbol, so a call the compiler decides not to inline (e.g. at
// -O0) would fail to link.
static inline uint32_t add_numbers_inline(uint32_t a, uint32_t b) {
    return a + b;
}
// Usage in performance-critical loop
// Each iteration adds (i + 1) to result. When the compiler inlines the helper,
// the loop body reduces to a plain addition with no call/return sequence.
for (int i = 0; i < 1000000; i++) {
result += add_numbers_inline(i, 1); // No function call overhead
}
Code Size Optimization:
// Small frequently-used function
// Returns bits 7..0 of `value`.
// `static inline` gives the helper internal linkage so it still links when a
// call is not inlined (plain `inline` provides no external definition in C).
static inline uint8_t get_lower_byte(uint32_t value) {
    return (uint8_t)(value & 0xFFu);
}
// Multiple call sites - code expanded at each location
// If the helper is inlined, each of these three uses gets its own copy of the
// mask operation instead of sharing one out-of-line function.
uint8_t byte1 = get_lower_byte(data1);
uint8_t byte2 = get_lower_byte(data2);
uint8_t byte3 = get_lower_byte(data3);
Hardware Abstraction:
// Efficient hardware access
// led_on: sets bit 13 of the 32-bit register at 0x40020014 (read-modify-write).
// `static inline` keeps the helpers linkable when a call is not inlined; the
// unsigned literal keeps the mask arithmetic out of signed territory.
static inline void led_on(void) {
    *((volatile uint32_t*)0x40020014) |= (1u << 13);
}
// led_off: clears bit 13 of the same register, leaving the other bits untouched.
static inline void led_off(void) {
    *((volatile uint32_t*)0x40020014) &= ~(1u << 13);
}
// Usage - direct hardware access without function call overhead
// (the overhead disappears only when the compiler actually inlines the calls)
led_on(); // Expands to direct register manipulation
led_off(); // Expands to direct register manipulation
Use Inline Functions When:
Use Macros When:
Avoid When:
Compiler Decision Process:
Inlining Criteria:
Compiler Optimizations:
Preprocessor Phase:
Macro vs. Function:
Function Call Overhead:
Inline Expansion Benefits:
Inline functions are functions that the compiler may expand at the call site instead of generating a function call. They provide the benefits of macros (no function call overhead) while maintaining type safety and debugging support.
Compiler Hints:
Type Safety:
// Basic inline function
// Returns x * x (wraps modulo 2^32).
// `static inline` so the definition is self-contained: plain `inline` in C
// leaves a non-inlined call without a symbol to link against.
static inline uint32_t square(uint32_t x) {
    return x * x;
}
// Inline function with multiple parameters
// Returns a * b + c (wraps modulo 2^32).
// `static inline` provides a definition even when the call is not inlined.
static inline uint32_t multiply_add(uint32_t a, uint32_t b, uint32_t c) {
    return a * b + c;
}
// Usage
// Both arguments are compile-time constants, so the results below are fixed.
uint32_t result1 = square(5); // 25
uint32_t result2 = multiply_add(2, 3, 4); // 10
// Inline hardware register access
// Writes a one-bit mask for `pin` to the register at 0x40020008.
// - `static inline`: linkable even when the call is not inlined.
// - The cast keeps `volatile` so the qualifier is never dropped, even briefly.
// - `1u`: an int shift into bit 31 would be undefined behaviour.
// `pin` is not range-checked; it must be smaller than 32.
static inline void gpio_set_pin(uint8_t pin) {
    volatile uint32_t* const GPIO_SET = (volatile uint32_t*)0x40020008;
    *GPIO_SET = (1u << pin);
}
// Writes a one-bit mask at position (pin + 16) to the register at 0x4002000C.
// `1u` matters here: with a signed literal, pin 15 would shift into the sign
// bit (1 << 31), which is undefined behaviour.
// `pin` is not range-checked; it must be smaller than 16.
static inline void gpio_clear_pin(uint8_t pin) {
    volatile uint32_t* const GPIO_CLEAR = (volatile uint32_t*)0x4002000C;
    *GPIO_CLEAR = (1u << (pin + 16));
}
// Returns true when bit `pin` of the register at 0x40020000 is set.
// `static inline` for safe linkage, a volatile-qualified cast, and an unsigned
// mask so that testing bit 31 is well defined.
static inline bool gpio_read_pin(uint8_t pin) {
    volatile uint32_t* const GPIO_DATA = (volatile uint32_t*)0x40020000;
    return (*GPIO_DATA & (1u << pin)) != 0;
}
// Usage
// Pin numbers are plain integers; the helpers turn them into bit masks.
gpio_set_pin(13); // Set LED pin
bool state = gpio_read_pin(12); // Read button state
// Force inline (GCC/Clang)
// Returns a * b (wraps modulo 2^32).
// always_inline forces expansion at direct call sites; `static` additionally
// guarantees a local definition if the function's address is ever taken.
static inline __attribute__((always_inline)) uint32_t fast_multiply(uint32_t a, uint32_t b) {
    return a * b;
}
// Force inline (MSVC)
// Returns a * b (wraps modulo 2^32).
// `__forceinline` is itself an inline specifier, so the extra `inline` keyword
// was redundant; `static` gives the helper internal linkage.
// MSVC-only: other compilers do not recognise `__forceinline`.
static __forceinline uint32_t fast_multiply_msvc(uint32_t a, uint32_t b) {
    return a * b;
}
// Cross-platform force inline
// FORCE_INLINE selects the strongest inlining request the compiler offers and
// falls back to a plain `inline` hint elsewhere.
#ifdef __GNUC__
#define FORCE_INLINE inline __attribute__((always_inline))
#elif defined(_MSC_VER)
#define FORCE_INLINE __forceinline
#else
#define FORCE_INLINE inline
#endif
// Returns a * b (wraps modulo 2^32).
// `static` is written at the definition rather than inside the macro: it gives
// every branch (notably the plain-`inline` fallback) a linkable definition
// without changing what FORCE_INLINE expands to for existing users.
static FORCE_INLINE uint32_t cross_platform_multiply(uint32_t a, uint32_t b) {
    return a * b;
}
// Inline with specific optimization
// Returns x * x + x + 1 (wraps modulo 2^32).
// NOTE: once the body is inlined it is compiled with the caller's settings,
// so optimize("O3") only influences an out-of-line copy of this function.
// `static` keeps the definition local and linkable.
static inline __attribute__((always_inline, optimize("O3")))
uint32_t optimized_function(uint32_t x) {
    return x * x + x + 1;
}
// No optimization (for debugging)
// Returns x * x + x + 1 (wraps modulo 2^32).
// always_inline and optimize("O0") contradict each other: a force-inlined body
// is optimized together with its caller, so the O0 request is lost. For a
// function you want to step through, keep it out of line with `noinline`.
static __attribute__((noinline, optimize("O0")))
uint32_t debug_function(uint32_t x) {
    return x * x + x + 1;
}
// Good candidate for inlining - small, frequently used
// Returns bits 15..8 of `value` (the upper byte of the low half-word, not the
// most significant byte of the 32-bit word).
// `static inline` so a non-inlined call still has a definition to link to.
static inline uint8_t get_upper_byte(uint32_t value) {
    return (uint8_t)((value >> 8) & 0xFFu);
}
// Good candidate - hardware access
// Executes the ARM `cpsie i` instruction. The "memory" clobber stops the
// compiler from moving memory accesses across the instruction.
// `static inline` keeps the one-instruction helper linkable at any -O level.
static inline void enable_interrupts(void) {
    __asm__ volatile("cpsie i" : : : "memory");
}
// Good candidate - simple math
// Returns the smaller of a and b; each argument is evaluated exactly once.
// `static inline` avoids the missing-symbol link error of plain C `inline`.
static inline uint32_t min(uint32_t a, uint32_t b) {
    return (a < b) ? a : b;
}
// Bad candidate - too large
// Deliberate counter-example: a long routine marked `inline` would be copied
// into every call site if the compiler honoured the hint.
// data/size: presumably a buffer and its element count (the body is elided).
inline void complex_algorithm(uint32_t* data, size_t size) {
// Complex algorithm with many lines of code
// Should not be inlined
}
Function-like macros are preprocessor directives that perform text substitution with parameters. They expand to code at the call site, eliminating function call overhead but without type safety.
Text Substitution:
Macro vs. Function:
// Function-like macros: pure text substitution, so every parameter and the
// whole expansion are wrapped in parentheses. An argument may be evaluated
// more than once.
#define SQUARE(v) ((v) * (v))
#define MAX(lhs, rhs) ((lhs) > (rhs) ? (lhs) : (rhs))
#define MIN(lhs, rhs) ((lhs) < (rhs) ? (lhs) : (rhs))
// Usage
// The expansions shown are exactly what the preprocessor hands to the compiler.
uint32_t result1 = SQUARE(5); // Expands to: ((5) * (5))
uint32_t result2 = MAX(10, 20); // Expands to: ((10) > (20) ? (10) : (20))
// Hardware register access macros
// GPIO_SET_PIN / GPIO_CLEAR_PIN write a one-bit mask to the registers at
// 0x40020008 / 0x4002000C. A plain store is used (not |=) so the macros behave
// like the gpio_set_pin()/gpio_clear_pin() inline helpers for the same
// addresses. NOTE(review): this assumes write-1-to-act registers - confirm
// against the device reference manual.
// `1u` keeps the shift unsigned: (pin + 16) reaches bit 31 for pin 15, which
// is undefined behaviour with a signed literal.
#define GPIO_SET_PIN(pin) \
    (*((volatile uint32_t*)0x40020008) = (1u << (pin)))
#define GPIO_CLEAR_PIN(pin) \
    (*((volatile uint32_t*)0x4002000C) = (1u << ((pin) + 16)))
// Evaluates to 1 when bit `pin` of the register at 0x40020000 is set, else 0.
#define GPIO_READ_PIN(pin) \
    ((*((volatile uint32_t*)0x40020000) & (1u << (pin))) != 0)
// Usage
// The macros are expressions, so they can stand alone or initialise a variable.
GPIO_SET_PIN(13); // Set LED pin
bool state = GPIO_READ_PIN(12); // Read button state
// Multi-line macro with do-while(0)
// Stores id_value and config_value into the `id` and `config` members of the
// structure dev_ptr points to, and sets its `status` to DEVICE_INACTIVE.
// The parameter names must differ from the member names: a parameter called
// `id` would also be substituted into `->id`, turning the expansion into
// `->1 = (1)`, which does not compile.
// do { ... } while(0) makes the expansion a single statement, safe inside an
// unbraced if/else. dev_ptr is evaluated three times - pass a side-effect-free
// expression.
#define INIT_DEVICE(dev_ptr, id_value, config_value) \
    do { \
        (dev_ptr)->id = (id_value); \
        (dev_ptr)->config = (config_value); \
        (dev_ptr)->status = DEVICE_INACTIVE; \
    } while(0)
// Usage
// device_t and DEVICE_INACTIVE are expected to be declared elsewhere.
device_t my_device;
INIT_DEVICE(&my_device, 1, 0x0F);
// Conditional compilation macros
// With DEBUG defined, DEBUG_PRINT prints its string argument after a
// "DEBUG: " prefix; otherwise it collapses to a no-op expression, so call
// sites compile unchanged in both configurations.
#if defined(DEBUG)
#define DEBUG_PRINT(text) printf("DEBUG: %s\n", (text))
#else
#define DEBUG_PRINT(text) ((void)0)
#endif
// Platform-specific macros
// CPU_FREQUENCY (in Hz, judging by the magnitudes) is chosen by whichever
// ARM_CORTEX_* symbol the build defines; 16 MHz is the default.
#if defined(ARM_CORTEX_M4)
#define CPU_FREQUENCY 168000000
#elif defined(ARM_CORTEX_M3)
#define CPU_FREQUENCY 72000000
#else
#define CPU_FREQUENCY 16000000
#endif
// Stringification - convert parameter to string
#define STRINGIFY(x) #x
// Two-level wrapper: the argument is macro-expanded first, then stringified.
#define TOSTRING(x) STRINGIFY(x)
// Token pasting - concatenate tokens
#define CONCAT(a, b) a##b
// Usage
// String literals must not be modified, hence the pointer-to-const.
const char* filename = TOSTRING(config.h); // Expands to: "config.h"
// Pasting forms the identifier being declared. (Initialising a new variable
// `var12` from CONCAT(var, 12) would read the variable in its own initializer.)
int CONCAT(var, 12) = 12; // Expands to: int var12 = 12;
// Parenthesized form: the argument and the whole product are grouped, so the
// expansion is immune to operator precedence at the use site.
#define SQUARE(v) ((v) * (v))
// Unparenthesized form, kept as the counter-example: the argument text is
// pasted in as-is and precedence can regroup it.
#define SQUARE_UNSAFE(v) v * v
// Usage examples
// Same argument text, different grouping after expansion: multiplication
// binds tighter than addition in the unparenthesized version.
uint32_t a = 2, b = 3;
uint32_t result1 = SQUARE(a + b); // Expands to: ((a + b) * (a + b)) = 25
uint32_t result2 = SQUARE_UNSAFE(a + b); // Expands to: a + b * a + b = 11 (wrong!)
// Macro with multiple evaluation (unsafe)
// The winning operand appears twice in the expansion, so it is evaluated twice.
#define MAX_UNSAFE(a, b) ((a) > (b) ? (a) : (b))
// Function with single evaluation (safe)
// Returns the larger of a and b; each argument is evaluated once, before the
// call. `static inline` so a non-inlined call still links.
static inline uint32_t max_safe(uint32_t a, uint32_t b) {
    return (a > b) ? a : b;
}
// Usage with side effects
// The counter starts above 5 so that the macro selects its first operand and
// therefore evaluates ++counter a second time. (Starting from 0, the macro
// would pick 5 and ++counter would run only once.)
uint32_t counter = 10;
uint32_t result1 = MAX_UNSAFE(++counter, 5); // ++counter runs twice: counter == 12, result1 == 12
uint32_t result2 = max_safe(++counter, 5); // ++counter runs once: counter == 13, result2 == 13
Conditional compilation allows different code to be compiled based on build-time conditions. It’s essential for creating portable code and optimizing for different platforms or build configurations.
Build-time Selection:
Common Use Cases:
// Debug configuration
// Release builds (DEBUG undefined): both macros are no-op expressions and the
// ASSERT condition is not evaluated.
// Debug builds: DEBUG_PRINT prints a prefixed message; ASSERT reports file and
// line of a failed check and then spins forever.
#ifndef DEBUG
#define DEBUG_PRINT(msg) ((void)0)
#define ASSERT(condition) ((void)0)
#else
#define DEBUG_PRINT(msg) printf("DEBUG: %s\n", (msg))
#define ASSERT(condition) \
    do { \
        if (!(condition)) { \
            printf("ASSERTION FAILED: %s, line %d\n", __FILE__, __LINE__); \
            for (;;) { } \
        } \
    } while(0)
#endif
// Usage
// Both lines vanish in builds without DEBUG, so neither may carry side effects
// the program depends on.
DEBUG_PRINT("Starting initialization");
ASSERT(device != NULL);
// Platform detection
// Defines PLATFORM (display string) and CPU_FREQUENCY from predefined compiler
// macros. Each architecture macro is tested once: previously __ARM_ARCH_7M__
// was checked twice, which made the Cortex-M3 branch unreachable and labelled
// ARMv7-M parts as Cortex-M7.
#ifdef __arm__
#if defined(__ARM_ARCH_7EM__)
// ARMv7E-M covers both Cortex-M4 and Cortex-M7; the architecture macro alone
// cannot tell them apart. A 216 MHz Cortex-M7 part needs a build-specific
// override of CPU_FREQUENCY.
#define PLATFORM "ARM Cortex-M4/M7"
#define CPU_FREQUENCY 180000000
#elif defined(__ARM_ARCH_7M__)
// ARMv7-M is the Cortex-M3 architecture.
#define PLATFORM "ARM Cortex-M3"
#define CPU_FREQUENCY 72000000
#else
#define PLATFORM "ARM (Unknown)"
#define CPU_FREQUENCY 16000000
#endif
#elif defined(__x86_64__)
#define PLATFORM "x86_64"
#define CPU_FREQUENCY 2400000000
#else
#define PLATFORM "Unknown"
#define CPU_FREQUENCY 16000000
#endif
// Feature configuration
// One switch per peripheral driver: 1 compiles its declarations in, 0 leaves
// them out. FEATURE_CAN is defined here but guards nothing in this snippet.
#define FEATURE_UART 1
#define FEATURE_SPI 1
#define FEATURE_I2C 0
#define FEATURE_CAN 1
// Conditional compilation based on features
#if FEATURE_UART != 0
void uart_init(void);
void uart_send_byte(uint8_t byte);
uint8_t uart_receive_byte(void);
#endif
#if FEATURE_SPI != 0
void spi_init(void);
uint8_t spi_transfer(uint8_t data);
#endif
#if FEATURE_I2C != 0
void i2c_init(void);
bool i2c_write(uint8_t address, uint8_t* data, uint8_t length);
#endif
Performance of inline functions and macros depends on several factors including compiler optimization, code size, and usage patterns.
Compiler Optimization:
Code Size Impact:
Usage Patterns:
// Optimize for performance
// Returns the number of set bits in `value`.
// Each pass clears the lowest set bit (value &= value - 1), so the loop runs
// once per set bit instead of once per bit position.
// `static` gives the always_inline helper a local definition as well.
static inline __attribute__((always_inline))
uint32_t fast_bit_count(uint32_t value) {
    uint32_t count = 0;
    while (value != 0) {
        value &= value - 1; // drop the lowest set bit
        count++;
    }
    return count;
}
// Optimize for size
// Returns the number of set bits in `value` via the compiler built-in.
// The `unsigned long` variant is used because `unsigned long` is at least
// 32 bits wide on every target, whereas __builtin_popcount takes an
// `unsigned int`, which may be 16 bits and would truncate the argument.
static inline __attribute__((always_inline))
uint32_t compact_bit_count(uint32_t value) {
    return (uint32_t)__builtin_popcountl((unsigned long)value); // Use built-in function
}
// Optimized macro for bit operations
// `reg` is an lvalue up to 32 bits wide, `bit` a position in 0..31.
// The mask is built from `1U`: shifting a signed 1 into bit 31 is undefined
// behaviour. READ_BIT yields 0 or 1.
#define SET_BIT(reg, bit) ((reg) |= (1U << (bit)))
#define CLEAR_BIT(reg, bit) ((reg) &= ~(1U << (bit)))
#define TOGGLE_BIT(reg, bit) ((reg) ^= (1U << (bit)))
#define READ_BIT(reg, bit) (((reg) >> (bit)) & 1)
// Usage in performance-critical code
// gpio_odr is a constant pointer to the volatile 32-bit register at 0x40020014.
// NOTE(review): the cast omits `volatile`; the declared type adds it back.
volatile uint32_t* const gpio_odr = (uint32_t*)0x40020014;
SET_BIT(*gpio_odr, 13); // Set LED pin
CLEAR_BIT(*gpio_odr, 13); // Clear LED pin
// Conditional optimization based on build type
// debug_multiply returns a * b (wraps modulo 2^32) in both configurations.
#ifdef DEBUG
// Debug version - no optimization
// Logs the operands first. They are cast to unsigned long for %lu because
// uint32_t is not `unsigned int` on every toolchain, so %u is not portable.
static inline uint32_t debug_multiply(uint32_t a, uint32_t b) {
    printf("Multiplying %lu by %lu\n", (unsigned long)a, (unsigned long)b);
    return a * b;
}
#else
// Release version - optimized
static inline __attribute__((always_inline))
uint32_t debug_multiply(uint32_t a, uint32_t b) {
    return a * b;
}
#endif
Debugging inline functions and macros requires special considerations because of how they’re processed by the compiler and preprocessor.
Inline Functions:
Macros:
// Inline function with debugging support
// Returns a * b (wraps modulo 2^32); DEBUG builds trace the operands first.
// `static inline` so the function links even when not inlined (the usual case
// in unoptimized debug builds). Operands are cast for %lu because uint32_t is
// not `unsigned int` on every toolchain.
static inline uint32_t debug_multiply(uint32_t a, uint32_t b) {
#ifdef DEBUG
    printf("DEBUG: multiply(%lu, %lu)\n", (unsigned long)a, (unsigned long)b);
#endif
    return a * b;
}
// Usage with debugging
// A real function keeps its own source lines, so a debugger can stop on the
// call and, when the function is not inlined, step into its body.
uint32_t result = debug_multiply(5, 3); // Can set breakpoint here
// Macro with debugging (limited)
// GNU statement expression: evaluates each argument once into a local, prints
// the operands and product, and yields the product as the macro's value.
// Values are cast for %lu because uint32_t is not `unsigned int` on every
// toolchain, so %u is not portable.
#define DEBUG_MULTIPLY(a, b) \
    ({ \
        uint32_t _a = (a); \
        uint32_t _b = (b); \
        uint32_t _result = _a * _b; \
        printf("DEBUG: multiply(%lu, %lu) = %lu\n", \
               (unsigned long)_a, (unsigned long)_b, (unsigned long)_result); \
        _result; \
    })
// Usage (no breakpoint possible in macro)
// The whole macro body expands onto this one source line, so the debugger has
// no separate lines inside it to stop on.
uint32_t result = DEBUG_MULTIPLY(5, 3);
// Safe macro with type checking (limited)
// Evaluates each argument exactly once into a local of the argument's own
// type and yields their product.
// Uses `__typeof__`, which GCC/Clang accept in every language mode; the plain
// `typeof` spelling is not a keyword under -std=c99/-std=c11.
#define SAFE_MULTIPLY(a, b) \
    ({ \
        __typeof__(a) _a = (a); \
        __typeof__(b) _b = (b); \
        _a * _b; \
    })
// Safer inline function with full type checking
// Returns a * b (wraps modulo 2^32); arguments are converted to uint32_t by
// the normal call rules and evaluated once.
// `static inline` so the function is always linkable.
static inline uint32_t safe_multiply(uint32_t a, uint32_t b) {
    return a * b;
}
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
// Platform detection
// PLATFORM_ARM is always defined (1 when the compiler predefines __arm__,
// otherwise 0), so it can be tested with #if rather than #ifdef.
#if defined(__arm__)
#define PLATFORM_ARM 1
#else
#define PLATFORM_ARM 0
#endif
// Debug configuration
// Without DEBUG both macros are no-op expressions (the ASSERT condition is not
// evaluated). With DEBUG, DEBUG_PRINT prints a prefixed message and ASSERT
// reports file and line of a failed check, then halts in an endless loop.
#ifndef DEBUG
#define DEBUG_PRINT(msg) ((void)0)
#define ASSERT(condition) ((void)0)
#else
#define DEBUG_PRINT(msg) printf("DEBUG: %s\n", (msg))
#define ASSERT(condition) \
    do { \
        if (!(condition)) { \
            printf("ASSERTION FAILED: %s, line %d\n", __FILE__, __LINE__); \
            for (;;) { } \
        } \
    } while(0)
#endif
// Hardware register definitions
// Register addresses are formed as base + offset (input register at +0x10,
// output register at +0x14).
#define GPIOA_BASE 0x40020000
#define GPIOA_IDR (GPIOA_BASE + 0x10)
#define GPIOA_ODR (GPIOA_BASE + 0x14)
// Hardware access macros
// Set and clear are read-modify-write operations on the output register; read
// tests one bit of the input register and yields 0 or 1.
#define GPIO_SET_PIN(n) \
    (*((volatile uint32_t*)GPIOA_ODR) |= (1 << (n)))
#define GPIO_CLEAR_PIN(n) \
    (*((volatile uint32_t*)GPIOA_ODR) &= ~(1 << (n)))
#define GPIO_READ_PIN(n) \
    ((*((volatile uint32_t*)GPIOA_IDR) & (1 << (n))) != 0)
// Inline functions for hardware access
// Function counterparts of the GPIO_* macros. `static inline` makes each one
// linkable when a call is not inlined; the casts keep `volatile`; the masks
// use `1u` so the shift is unsigned. `pin` is not range-checked.

// Sets bit `pin` in the GPIOA_ODR register (read-modify-write).
static inline void gpio_set_pin_inline(uint8_t pin) {
    volatile uint32_t* const gpio_odr = (volatile uint32_t*)GPIOA_ODR;
    *gpio_odr |= (1u << pin);
}
// Clears bit `pin` in the GPIOA_ODR register (read-modify-write).
static inline void gpio_clear_pin_inline(uint8_t pin) {
    volatile uint32_t* const gpio_odr = (volatile uint32_t*)GPIOA_ODR;
    *gpio_odr &= ~(1u << pin);
}
// Returns true when bit `pin` of the GPIOA_IDR register is set.
static inline bool gpio_read_pin_inline(uint8_t pin) {
    volatile uint32_t* const gpio_idr = (volatile uint32_t*)GPIOA_IDR;
    return (*gpio_idr & (1u << pin)) != 0;
}
// Performance-critical inline functions
// Both wrap modulo 2^32. always_inline forces expansion at direct call sites;
// `static` also provides a local definition if an address is ever taken.

// Returns a * b.
static inline __attribute__((always_inline))
uint32_t fast_multiply(uint32_t a, uint32_t b) {
    return a * b;
}
// Returns a + b.
static inline __attribute__((always_inline))
uint32_t fast_add(uint32_t a, uint32_t b) {
    return a + b;
}
// Conditional compilation based on platform
// On ARM builds the helpers execute `cpsie i` / `cpsid i`; the "memory"
// clobber keeps the compiler from moving memory accesses across them. On
// every other platform they are empty placeholders.
// `static inline` so each helper links even when the call is not inlined.
#if PLATFORM_ARM
static inline void enable_interrupts(void) {
    __asm__ volatile("cpsie i" : : : "memory");
}
static inline void disable_interrupts(void) {
    __asm__ volatile("cpsid i" : : : "memory");
}
#else
static inline void enable_interrupts(void) {
    // Platform-specific implementation
}
static inline void disable_interrupts(void) {
    // Platform-specific implementation
}
#endif
// Debugging support
// Returns a * b (wraps modulo 2^32), bracketing the multiplication with
// DEBUG_PRINT trace messages.
// `static inline` so the function links in unoptimized builds, where calls
// are typically not inlined.
static inline uint32_t debug_multiply(uint32_t a, uint32_t b) {
    DEBUG_PRINT("Performing multiplication");
    uint32_t result = a * b;
    DEBUG_PRINT("Multiplication complete");
    return result;
}
// Main function
// Exercises each technique once: macro-based GPIO access, force-inlined
// arithmetic, the platform-selected interrupt helper and the traced multiply.
// Returns 0.
int main(void) {
    DEBUG_PRINT("Starting application");
    // Use macros for hardware access
    GPIO_SET_PIN(13); // Set LED pin
    bool button_state = GPIO_READ_PIN(12); // Read button state
    (void)button_state; // sampled for demonstration only
    // Use inline functions for performance-critical operations
    uint32_t result1 = fast_multiply(5, 3);
    uint32_t result2 = fast_add(10, 20);
    // Use conditional compilation
    enable_interrupts();
    // Use debugging support
    uint32_t debug_result = debug_multiply(4, 6);
    ASSERT(result1 == 15);
    ASSERT(result2 == 30);
    ASSERT(debug_result == 24);
    // ASSERT expands to nothing in release builds; mark the values as used so
    // those builds do not warn about unused variables.
    (void)result1;
    (void)result2;
    (void)debug_result;
    DEBUG_PRINT("Application complete");
    return 0;
}
Problem: Macros can cause unexpected side effects. Solution: Parenthesize every parameter and the whole expansion, and avoid evaluating an argument more than once.
// ❌ Bad: Macro with side effects
// Deliberately left unparenthesized to show both failure modes.
#define SQUARE(x) x * x
#define MAX(a, b) a > b ? a : b
// Usage
uint32_t result1 = SQUARE(2 + 3); // Expands to: 2 + 3 * 2 + 3 = 11 (wrong!)
// The counter starts above 5 so the macro selects its first operand and
// evaluates it again. (Starting from 0, `5` would win and the increment would
// run only once.)
uint32_t counter = 10;
uint32_t result2 = MAX(++counter, 5); // Expands to: ++counter > 5 ? ++counter : 5 - counter incremented twice (12)!
// ✅ Good: Safe macro with parentheses
// Parentheses fix the precedence problem only; an argument with side effects
// is still evaluated more than once.
#define SQUARE(v) ((v) * (v))
#define MAX(lhs, rhs) ((lhs) > (rhs) ? (lhs) : (rhs))
// ✅ Better: Use inline function
// Arguments are evaluated exactly once and converted to uint32_t at the call.
// `static inline` so the helpers link even when a call is not inlined.

// Returns x * x (wraps modulo 2^32).
static inline uint32_t square(uint32_t x) {
    return x * x;
}
// Returns the larger of a and b.
static inline uint32_t max(uint32_t a, uint32_t b) {
    return (a > b) ? a : b;
}
Problem: Large functions are being inlined, which bloats the code. Solution: Only inline small, frequently used functions.
// ❌ Bad: Large inline function
// Counter-example: the body stands in for a long routine whose code would be
// copied into every call site if the inline hint were honoured.
// data/size: presumably a buffer and its element count (the body is elided).
inline void complex_algorithm(uint32_t* data, size_t size) {
// 50+ lines of complex code
// Should not be inlined
}
// ✅ Good: Small inline function
// Returns bits 15..8 of `value` as a number in 0..255.
// `static inline` so a non-inlined call still links. The expression is
// already uint32_t, so no cast is needed.
static inline uint32_t get_upper_byte(uint32_t value) {
    return (value >> 8) & 0xFFu;
}
Problem: Inline functions and macros can be difficult to debug. Solution: Prefer inline functions, and build tracing into them for debug builds.
// ❌ Bad: No debugging support
// Stores `data` as a 32-bit volatile write at `address`; nothing records
// what was written or where.
#define HARDWARE_ACCESS(address, data) (*((volatile uint32_t*)(address)) = (data))
// ✅ Good: Debugging support
// Writes `value` to the 32-bit register at address `addr`; DEBUG builds log
// the write first.
// - `static inline`: links even when the call is not inlined.
// - Values are cast for %lX because uint32_t is not `unsigned int` on every
//   toolchain.
// - The address goes through uintptr_t so the integer-to-pointer conversion
//   is explicit and warning-free where pointers are wider than 32 bits.
static inline void hardware_access(uint32_t addr, uint32_t value) {
#ifdef DEBUG
    printf("Writing 0x%08lX to address 0x%08lX\n",
           (unsigned long)value, (unsigned long)addr);
#endif
    *((volatile uint32_t*)(uintptr_t)addr) = value;
}
Problem: The code is not portable across platforms. Solution: Use conditional compilation to select a per-platform implementation.
// ❌ Bad: Platform-specific code
// Counter-example: the instruction is emitted unconditionally, so this only
// assembles for targets that understand `cpsie`.
inline void enable_interrupts(void) {
__asm__ volatile("cpsie i" : : : "memory"); // ARM-specific
}
// ✅ Good: Platform-independent code
// One enable_interrupts() per target, chosen by predefined compiler macros:
// `cpsie i` on ARM, `sti` on x86-64, an empty placeholder elsewhere.
// `static inline` keeps every variant linkable. Both instruction variants
// carry a "memory" clobber so the compiler cannot move memory accesses across
// the point where interrupts are enabled.
#ifdef __arm__
static inline void enable_interrupts(void) {
    __asm__ volatile("cpsie i" : : : "memory");
}
#elif defined(__x86_64__)
static inline void enable_interrupts(void) {
    __asm__ volatile("sti" : : : "memory");
}
#else
static inline void enable_interrupts(void) {
    // Platform-specific implementation
}
#endif
Next Steps: Explore Compiler Intrinsics to understand hardware-specific operations, or dive into Assembly Integration for low-level programming techniques.