The "Holy Bible" for embedded engineers
Performance optimization in embedded systems is critical due to limited resources, real-time constraints, and power requirements. This guide covers essential optimization techniques, profiling methods, and best practices for embedded software development.
Concept: Performance optimization in embedded systems is about making the most of limited resources while maintaining reliability and meeting real-time constraints. It’s not just about making code faster, but about understanding the trade-offs between speed, memory usage, power consumption, and code maintainability.
Why it matters: In embedded systems, performance directly affects battery life, responsiveness, and the ability to meet real-time deadlines. Poor performance can lead to missed deadlines, excessive power consumption, or system failures. Good optimization can enable new features, extend battery life, and improve user experience.
Minimal example: A simple loop optimization that demonstrates how loop unrolling can improve performance by reducing loop overhead and enabling better compiler optimizations.
Try it: Profile a simple embedded application to identify the biggest performance bottlenecks, then apply targeted optimizations and measure the actual improvement.
Takeaways: Performance optimization requires measurement, understanding of the target hardware, and careful consideration of trade-offs. The best optimizations often come from algorithmic improvements rather than micro-optimizations.
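As a concrete illustration of "algorithm first", replacing a linear scan of a sorted lookup table with a binary search is a change no compiler flag can make for you. A minimal sketch (the table contents are hypothetical):

#include <stdint.h>

// O(n) lookup: cost grows linearly with table size
int linear_search(const uint16_t *table, int size, uint16_t key) {
    for (int i = 0; i < size; i++) {
        if (table[i] == key) return i;
    }
    return -1;
}

// O(log n) lookup on a sorted table: far fewer iterations for large tables
int binary_search(const uint16_t *table, int size, uint16_t key) {
    int lo = 0, hi = size - 1;
    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        if (table[mid] == key) return mid;
        if (table[mid] < key) lo = mid + 1;
        else hi = mid - 1;
    }
    return -1;
}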
# GCC optimization flags for embedded systems (ARMv7-A example)
# Note: -ffast-math relaxes IEEE 754 floating-point semantics; use it only when
# that is acceptable for your application
gcc -O2 -march=armv7-a -mtune=cortex-a7 -mfpu=neon -mfloat-abi=hard \
    -ffast-math -funroll-loops -fomit-frame-pointer \
    -fno-stack-protector -fno-common -fno-builtin \
    -o optimized_program program.c
# ARM-specific optimizations
gcc -O3 -march=armv8-a -mtune=cortex-a53 \
-fno-stack-protector -fomit-frame-pointer \
-ffunction-sections -fdata-sections \
-Wl,--gc-sections -o program program.c
// Example: Optimization-aware code structure
// Use __attribute__ for compiler hints
__attribute__((hot)) void performance_critical_function(void) {
// This function is marked as frequently called
for (int i = 0; i < 1000; i++) {
process_data(i);
}
}
__attribute__((cold)) void rarely_called_function(void) {
// This function is marked as rarely called
log_debug_info();
}
// Suggest inlining for small functions (static inline is a hint; see always_inline below to force it)
static inline uint32_t fast_multiply(uint32_t a, uint32_t b) {
return a * b;
}
// Unoptimized loop
void unoptimized_loop(int *array, int size) {
for (int i = 0; i < size; i++) {
array[i] = array[i] * 2 + 1;
}
}
// Optimized loop with loop unrolling
void optimized_loop(int *array, int size) {
int i;
// Unroll by 4
for (i = 0; i < size - 3; i += 4) {
array[i] = array[i] * 2 + 1;
array[i+1] = array[i+1] * 2 + 1;
array[i+2] = array[i+2] * 2 + 1;
array[i+3] = array[i+3] * 2 + 1;
}
// Handle remaining elements
for (; i < size; i++) {
array[i] = array[i] * 2 + 1;
}
}
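Manual unrolling can also be delegated to the compiler. A sketch using a GCC-specific pragma (available in GCC 8 and later; support depends on your toolchain):

// Ask the compiler to unroll the following loop by 4 (GCC 8+)
void pragma_unrolled_loop(int *array, int size) {
    #pragma GCC unroll 4
    for (int i = 0; i < size; i++) {
        array[i] = array[i] * 2 + 1;
    }
}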
// SIMD optimization example (NEON intrinsics require #include <arm_neon.h>)
void simd_optimized_loop(int *array, int size) {
#ifdef __ARM_NEON
    // Process four 32-bit elements per iteration with ARM NEON intrinsics
    int32x4_t multiplier = vdupq_n_s32(2);
    int32x4_t adder = vdupq_n_s32(1);
    int i;
    for (i = 0; i + 3 < size; i += 4) {
        int32x4_t data = vld1q_s32(&array[i]);
        data = vmulq_s32(data, multiplier);
        data = vaddq_s32(data, adder);
        vst1q_s32(&array[i], data);
    }
    // Scalar tail for sizes that are not a multiple of 4
    for (; i < size; i++) {
        array[i] = array[i] * 2 + 1;
    }
#else
    // Portable fallback when NEON is not available
    for (int i = 0; i < size; i++) {
        array[i] = array[i] * 2 + 1;
    }
#endif
}
// Example: data structure laid out for cache locality
typedef struct {
    uint32_t id;
    uint32_t value;
    uint8_t flags;
    uint8_t padding[3]; // Explicit padding keeps the size a multiple of 4 bytes (12 total)
} cache_friendly_struct_t;
// Cache-friendly array access
void cache_friendly_access(int *array, int rows, int cols) {
// Access data in row-major order for better cache performance
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
array[i * cols + j] = process_element(i, j);
}
}
}
// Avoid cache-unfriendly access patterns
void cache_unfriendly_access(int *array, int rows, int cols) {
// This pattern causes cache misses
for (int j = 0; j < cols; j++) {
for (int i = 0; i < rows; i++) {
array[i * cols + j] = process_element(i, j);
}
}
}
// Small functions should be inlined
static inline uint32_t bit_count(uint32_t x) {
x = x - ((x >> 1) & 0x55555555);
x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
x = (x + (x >> 4)) & 0x0f0f0f0f;
x = x + (x >> 8);
x = x + (x >> 16);
return x & 0x3f;
}
// Force inline for critical functions
static inline __attribute__((always_inline))
uint32_t critical_function(uint32_t input) {
return input * 7 + 13;
}
// Use function pointers for polymorphic behavior
typedef void (*process_func_t)(int *data, int size);
void process_data_optimized(int *data, int size, process_func_t func) {
// Avoid virtual function overhead
func(data, size);
}
// Example usage
void process_fast(int *data, int size) {
// Fast processing implementation
for (int i = 0; i < size; i++) {
data[i] *= 2;
}
}
void process_accurate(int *data, int size) {
// Accurate processing implementation
for (int i = 0; i < size; i++) {
data[i] = complex_calculation(data[i]);
}
}
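A usage sketch: pick the implementation once, then pass it through the function pointer so each call costs a single indirect branch (the need_accuracy flag is illustrative):

// Select the processing routine at runtime without a dispatch layer
void run_processing(int *data, int size, int need_accuracy) {
    process_func_t func = need_accuracy ? process_accurate : process_fast;
    process_data_optimized(data, size, func);
}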
// Unoptimized structure
typedef struct {
uint8_t a;
uint32_t b;
uint8_t c;
uint16_t d;
} unoptimized_struct_t; // Size: 12 bytes (4-byte alignment)
// Optimized structure: members ordered largest to smallest, so no padding is needed
typedef struct {
    uint32_t b; // 4 bytes
    uint16_t d; // 2 bytes
    uint8_t a;  // 1 byte
    uint8_t c;  // 1 byte
} optimized_struct_t; // Size: 8 bytes; the packed attribute is unnecessary here
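A compile-time check documents and enforces the layout assumption; a minimal sketch using C11 _Static_assert:

// Fail the build if the structure layout regresses (C11)
_Static_assert(sizeof(optimized_struct_t) == 8, "optimized_struct_t must be 8 bytes");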
// Memory pool for fixed-size allocations
typedef struct {
uint8_t *pool;
uint32_t pool_size;
uint32_t block_size;
uint32_t free_blocks;
uint32_t *free_list;
} memory_pool_t;
// Initialize memory pool
int memory_pool_init(memory_pool_t *pool, uint32_t block_size, uint32_t num_blocks) {
pool->block_size = block_size;
pool->pool_size = block_size * num_blocks;
pool->free_blocks = num_blocks;
// Allocate pool memory
pool->pool = malloc(pool->pool_size);
if (!pool->pool) return -1;
// Initialize free list
pool->free_list = malloc(num_blocks * sizeof(uint32_t));
if (!pool->free_list) {
free(pool->pool);
return -1;
}
// Build free list
for (uint32_t i = 0; i < num_blocks; i++) {
pool->free_list[i] = i;
}
return 0;
}
// Allocate from pool
void* memory_pool_alloc(memory_pool_t *pool) {
if (pool->free_blocks == 0) return NULL;
uint32_t block_index = pool->free_list[--pool->free_blocks];
return pool->pool + (block_index * pool->block_size);
}
// Free to pool
void memory_pool_free(memory_pool_t *pool, void *ptr) {
if (!ptr) return;
uint32_t block_index = ((uint8_t*)ptr - pool->pool) / pool->block_size;
pool->free_list[pool->free_blocks++] = block_index;
}
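A usage sketch, assuming 32-byte blocks are large enough for the application's fixed-size messages:

// Example usage: fixed-size buffers with O(1) allocate and free
void memory_pool_example(void) {
    memory_pool_t pool;
    if (memory_pool_init(&pool, 32, 16) != 0) {
        return; // Initialization failed (out of heap)
    }
    void *msg = memory_pool_alloc(&pool);
    if (msg) {
        // ... fill and use the 32-byte block ...
        memory_pool_free(&pool, msg);
    }
}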
// Monitor stack usage
typedef struct {
uint32_t max_stack_usage;
uint32_t current_stack_usage;
uint8_t *stack_start;
uint8_t *stack_end;
} stack_monitor_t;
// Initialize stack monitor
void stack_monitor_init(stack_monitor_t *monitor, uint8_t *stack_start, uint32_t stack_size) {
monitor->stack_start = stack_start;
monitor->stack_end = stack_start + stack_size;
monitor->max_stack_usage = 0;
monitor->current_stack_usage = 0;
// Fill stack with pattern for usage detection
for (uint32_t i = 0; i < stack_size; i++) {
stack_start[i] = 0xAA;
}
}
// Check stack usage
uint32_t stack_monitor_check_usage(stack_monitor_t *monitor) {
uint8_t *current_stack = (uint8_t*)&current_stack; // Address of a local approximates the current stack pointer
uint32_t usage = monitor->stack_end - current_stack;
if (usage > monitor->max_stack_usage) {
monitor->max_stack_usage = usage;
}
return usage;
}
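The 0xAA fill written by stack_monitor_init can also be scanned to obtain a worst-case high-water mark; a minimal sketch, assuming a descending stack that grows from stack_end toward stack_start:

// Find the deepest point where the fill pattern has been overwritten
uint32_t stack_monitor_high_water_mark(stack_monitor_t *monitor) {
    uint8_t *p = monitor->stack_start;
    // Untouched bytes at the low end still hold the 0xAA pattern
    while (p < monitor->stack_end && *p == 0xAA) {
        p++;
    }
    // Worst-case usage runs from the top of the stack down to the deepest dirty byte
    return (uint32_t)(monitor->stack_end - p);
}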
// Convert recursive function to iterative
// Recursive version (stack intensive)
int recursive_factorial(int n) {
if (n <= 1) return 1;
return n * recursive_factorial(n - 1);
}
// Iterative version (stack efficient)
int iterative_factorial(int n) {
int result = 1;
for (int i = 2; i <= n; i++) {
result *= i;
}
return result;
}
// Tail recursion optimization
int tail_recursive_factorial(int n, int acc) {
if (n <= 1) return acc;
return tail_recursive_factorial(n - 1, n * acc);
}
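The accumulator is normally hidden behind a small wrapper; with optimization enabled (for example -O2), GCC typically converts the tail call into a loop, so the stack depth stays constant:

// Wrapper that hides the accumulator parameter
int factorial(int n) {
    return tail_recursive_factorial(n, 1);
}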
// Simple custom allocator for embedded systems
typedef struct {
uint8_t *heap_start;
uint8_t *heap_end;
uint8_t *current_ptr;
uint32_t total_allocated;
} simple_allocator_t;
// Initialize allocator
void simple_allocator_init(simple_allocator_t *alloc, uint8_t *heap_start, uint32_t heap_size) {
alloc->heap_start = heap_start;
alloc->heap_end = heap_start + heap_size;
alloc->current_ptr = heap_start;
alloc->total_allocated = 0;
}
// Allocate memory
void* simple_allocator_alloc(simple_allocator_t *alloc, uint32_t size) {
// Align to 4-byte boundary
size = (size + 3) & ~3;
if (alloc->current_ptr + size > alloc->heap_end) {
return NULL; // Out of memory
}
void *ptr = alloc->current_ptr;
alloc->current_ptr += size;
alloc->total_allocated += size;
return ptr;
}
// Reset allocator (for embedded systems that don't need individual free)
void simple_allocator_reset(simple_allocator_t *alloc) {
alloc->current_ptr = alloc->heap_start;
alloc->total_allocated = 0;
}
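A usage sketch: carving per-cycle scratch buffers out of a static arena and resetting the whole arena at the end of each processing cycle (the buffer sizes are illustrative):

// Example: per-cycle scratch allocations from a static arena
static uint8_t scratch_heap[4096];

void allocator_example(void) {
    simple_allocator_t alloc;
    simple_allocator_init(&alloc, scratch_heap, sizeof(scratch_heap));

    int16_t *samples = simple_allocator_alloc(&alloc, 256 * sizeof(int16_t));
    uint8_t *packet = simple_allocator_alloc(&alloc, 64);
    if (samples && packet) {
        // ... process this cycle's data ...
    }
    // One reset releases everything allocated during the cycle
    simple_allocator_reset(&alloc);
}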
// CPU frequency scaling for power optimization
typedef enum {
CPU_FREQ_LOW = 0, // 100 MHz
CPU_FREQ_MEDIUM, // 400 MHz
CPU_FREQ_HIGH // 800 MHz
} cpu_freq_t;
// Set CPU frequency
int set_cpu_frequency(cpu_freq_t freq) {
switch (freq) {
case CPU_FREQ_LOW:
// Configure PLL for low frequency
configure_pll(100000000);
break;
case CPU_FREQ_MEDIUM:
configure_pll(400000000);
break;
case CPU_FREQ_HIGH:
configure_pll(800000000);
break;
default:
return -1;
}
// Update system clock
update_system_clock();
return 0;
}
// Adaptive frequency scaling
void adaptive_frequency_scaling(void) {
uint32_t cpu_usage = get_cpu_usage();
if (cpu_usage < 25) {
set_cpu_frequency(CPU_FREQ_LOW);
} else if (cpu_usage < 75) {
set_cpu_frequency(CPU_FREQ_MEDIUM);
} else {
set_cpu_frequency(CPU_FREQ_HIGH);
}
}
// Power management states
typedef enum {
POWER_STATE_ACTIVE,
POWER_STATE_IDLE,
POWER_STATE_SLEEP,
POWER_STATE_DEEP_SLEEP
} power_state_t;
// Enter sleep mode
void enter_sleep_mode(power_state_t state) {
switch (state) {
case POWER_STATE_IDLE:
// Disable CPU clock, keep peripherals active
disable_cpu_clock();
break;
case POWER_STATE_SLEEP:
// Disable most peripherals, keep RAM
disable_peripherals();
enter_light_sleep(); // Low-level sleep entry point (platform-specific)
break;
case POWER_STATE_DEEP_SLEEP:
// Disable everything except wake-up sources
disable_all_peripherals();
save_context();
enter_deep_sleep();
break;
default:
break;
}
}
// Wake-up handler
void wake_up_handler(void) {
// Restore context if needed
restore_context();
// Re-enable necessary peripherals
enable_peripherals();
// Resume normal operation
resume_operation();
}
// Peripheral clock management
typedef struct {
uint32_t peripheral_mask;
uint32_t clock_enabled;
} peripheral_power_t;
// Enable peripheral clock
void enable_peripheral_clock(uint32_t peripheral) {
// Set clock enable bit
PERIPHERAL_CLOCK_REG |= (1 << peripheral);
}
// Disable peripheral clock
void disable_peripheral_clock(uint32_t peripheral) {
// Clear clock enable bit
PERIPHERAL_CLOCK_REG &= ~(1 << peripheral);
}
// Power-aware peripheral usage: gate peripheral clocks around actual use
void power_aware_peripheral_usage(const uint8_t *data, uint8_t *dma_dest, uint32_t size) {
// Enable the UART clock only for the duration of the transfer
enable_peripheral_clock(UART_CLOCK);
uart_transmit(data, size);
disable_peripheral_clock(UART_CLOCK);
// Use DMA for efficient bulk transfers; disable the DMA clock from the
// transfer-complete interrupt once the transfer finishes
enable_peripheral_clock(DMA_CLOCK);
dma_transfer(data, dma_dest, size);
}
// Power-efficient interrupt handling
typedef struct {
uint32_t wake_up_sources;
uint32_t sleep_duration;
} power_config_t;
// Configure power management
void configure_power_management(power_config_t *config) {
// Enable wake-up sources
enable_wake_up_source(config->wake_up_sources);
// Set sleep duration
configure_sleep_timer(config->sleep_duration);
}
// Power-efficient main loop
void power_efficient_main_loop(void) {
while (1) {
// Process pending tasks
process_pending_tasks();
// Check if sleep is possible
if (can_enter_sleep()) {
// Enter sleep mode
enter_sleep_mode(POWER_STATE_SLEEP);
// Wait for interrupt
__WFI(); // Wait for interrupt
// Resume after wake-up
resume_after_wake_up();
}
}
}
// High-resolution timer for performance measurement
typedef struct {
uint32_t start_time;
uint32_t end_time;
uint32_t duration;
} performance_timer_t;
// Start timing
void timer_start(performance_timer_t *timer) {
timer->start_time = get_high_resolution_time();
}
// Stop timing
void timer_stop(performance_timer_t *timer) {
timer->end_time = get_high_resolution_time();
timer->duration = timer->end_time - timer->start_time;
}
// Convert timing to microseconds (assumes the timer counts CPU clock cycles)
uint32_t timer_get_microseconds(performance_timer_t *timer) {
return timer->duration / (get_cpu_frequency() / 1000000);
}
// Example usage
void measure_performance(void) {
performance_timer_t timer;
timer_start(&timer);
performance_critical_function();
timer_stop(&timer);
printf("Function took %u microseconds\n", timer_get_microseconds(&timer));
}
// Real-time constraint checking
typedef struct {
uint32_t deadline;
uint32_t worst_case_time;
uint32_t actual_time;
} real_time_constraint_t;
// Check real-time constraint
int check_real_time_constraint(real_time_constraint_t *constraint) {
if (constraint->actual_time > constraint->deadline) {
// Real-time constraint violated
return -1;
}
// Check if we have enough margin (integer math: warn below 10% of worst-case time)
uint32_t margin = constraint->deadline - constraint->actual_time;
if (margin * 10 < constraint->worst_case_time) {
// Warning: low margin
return 1;
}
return 0; // OK
}
// Real-time task execution
void real_time_task_execute(real_time_constraint_t *constraint) {
performance_timer_t timer;
timer_start(&timer);
// Execute real-time task
execute_real_time_task();
timer_stop(&timer);
constraint->actual_time = timer_get_microseconds(&timer);
// Check constraint
int result = check_real_time_constraint(constraint);
if (result < 0) {
// Handle constraint violation
handle_constraint_violation();
}
}
// Task scheduling information
typedef struct {
uint32_t task_id;
uint32_t priority;
uint32_t execution_time;
uint32_t period;
uint32_t deadline;
uint32_t missed_deadlines;
} task_info_t;
// Analyze task scheduling
void analyze_task_scheduling(task_info_t *tasks, int num_tasks) {
uint32_t total_utilization = 0;
for (int i = 0; i < num_tasks; i++) {
// Calculate utilization
uint32_t utilization = (tasks[i].execution_time * 100) / tasks[i].period;
total_utilization += utilization;
// Check for missed deadlines
if (tasks[i].missed_deadlines > 0) {
printf("Task %u missed %u deadlines\n",
tasks[i].task_id, tasks[i].missed_deadlines);
}
}
// Check total utilization
if (total_utilization > 100) {
printf("Warning: Total utilization exceeds 100%% (%u%%)\n", total_utilization);
}
}
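A usage sketch with an illustrative three-task set (all timing values are made up; execution times and periods are in the same unit, e.g. microseconds):

// Example: utilization and deadline report for a small task set
void scheduling_example(void) {
    task_info_t tasks[] = {
        { .task_id = 1, .priority = 3, .execution_time = 200,
          .period = 1000, .deadline = 1000, .missed_deadlines = 0 },
        { .task_id = 2, .priority = 2, .execution_time = 1000,
          .period = 5000, .deadline = 5000, .missed_deadlines = 0 },
        { .task_id = 3, .priority = 1, .execution_time = 2000,
          .period = 10000, .deadline = 10000, .missed_deadlines = 1 },
    };
    analyze_task_scheduling(tasks, sizeof(tasks) / sizeof(tasks[0]));
}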
// Function profiling structure
typedef struct {
char function_name[64];
uint32_t call_count;
uint32_t total_time;
uint32_t min_time;
uint32_t max_time;
uint32_t average_time;
} function_profile_t;
// Profiling macros
#define PROFILE_START(name) \
performance_timer_t _timer_##name; \
timer_start(&_timer_##name)
#define PROFILE_END(name) \
timer_stop(&_timer_##name); \
update_function_profile(#name, _timer_##name.duration)
// Update function profile
void update_function_profile(const char *name, uint32_t duration) {
function_profile_t *profile = find_or_create_profile(name);
if (profile) {
profile->call_count++;
profile->total_time += duration;
if (duration < profile->min_time || profile->min_time == 0) {
profile->min_time = duration;
}
if (duration > profile->max_time) {
profile->max_time = duration;
}
profile->average_time = profile->total_time / profile->call_count;
}
}
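find_or_create_profile is left to the application; a minimal sketch backed by a small static table (the table size and linear lookup are assumptions, it requires <string.h>, and in a real build it would be declared before update_function_profile):

#define MAX_PROFILES 32
static function_profile_t profiles[MAX_PROFILES];
static uint32_t profile_count = 0;

// Linear lookup by name; creates a new entry if there is room
function_profile_t* find_or_create_profile(const char *name) {
    for (uint32_t i = 0; i < profile_count; i++) {
        if (strcmp(profiles[i].function_name, name) == 0) {
            return &profiles[i];
        }
    }
    if (profile_count >= MAX_PROFILES) return NULL;
    function_profile_t *p = &profiles[profile_count++];
    strncpy(p->function_name, name, sizeof(p->function_name) - 1);
    return p;
}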
// Example usage
void profiled_function(void) {
PROFILE_START(profiled_function);
// Function implementation
for (int i = 0; i < 1000; i++) {
process_data(i);
}
PROFILE_END(profiled_function);
}
// Memory profiling
typedef struct {
uint32_t total_allocations;
uint32_t total_frees;
uint32_t current_usage;
uint32_t peak_usage;
uint32_t allocation_count;
} memory_profile_t;
static memory_profile_t memory_profile = {0};
// Memory allocation tracking
void* tracked_malloc(size_t size) {
void *ptr = malloc(size);
if (ptr) {
memory_profile.total_allocations += size;
memory_profile.current_usage += size;
memory_profile.allocation_count++;
if (memory_profile.current_usage > memory_profile.peak_usage) {
memory_profile.peak_usage = memory_profile.current_usage;
}
}
return ptr;
}
// Memory free tracking
void tracked_free(void *ptr) {
if (ptr) {
// Note: This is simplified - in practice you'd need to track allocation sizes
memory_profile.total_frees++;
memory_profile.allocation_count--;
}
free(ptr);
}
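One common way to make the free path exact is to prepend a small header recording the allocation size; a minimal sketch (the 8-byte header preserves 8-byte alignment of the returned pointer):

// Store the requested size in front of each block so it can be subtracted on free
void* tracked_malloc_sized(size_t size) {
    uint8_t *raw = malloc(size + 8);
    if (!raw) return NULL;
    *(size_t*)raw = size; // Header holds the requested size
    memory_profile.total_allocations += size;
    memory_profile.current_usage += size;
    memory_profile.allocation_count++;
    if (memory_profile.current_usage > memory_profile.peak_usage) {
        memory_profile.peak_usage = memory_profile.current_usage;
    }
    return raw + 8;
}

void tracked_free_sized(void *ptr) {
    if (!ptr) return;
    uint8_t *raw = (uint8_t*)ptr - 8;
    memory_profile.current_usage -= (uint32_t)*(size_t*)raw;
    memory_profile.total_frees++;
    memory_profile.allocation_count--;
    free(raw);
}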
// Print memory profile
void print_memory_profile(void) {
printf("Memory Profile:\n");
printf(" Total allocations: %u bytes\n", memory_profile.total_allocations);
printf(" Current usage: %u bytes\n", memory_profile.current_usage);
printf(" Peak usage: %u bytes\n", memory_profile.peak_usage);
printf(" Allocation count: %u\n", memory_profile.allocation_count);
}
// Benchmark framework
typedef struct {
char benchmark_name[64];
uint32_t iterations;
uint32_t total_time;
uint32_t min_time;
uint32_t max_time;
uint32_t average_time;
} benchmark_t;
// Run benchmark
void run_benchmark(const char *name, void (*function)(void), uint32_t iterations) {
benchmark_t benchmark = {0};
strncpy(benchmark.benchmark_name, name, sizeof(benchmark.benchmark_name) - 1);
benchmark.iterations = iterations;
performance_timer_t timer;
for (uint32_t i = 0; i < iterations; i++) {
timer_start(&timer);
function();
timer_stop(&timer);
uint32_t duration = timer_get_microseconds(&timer);
benchmark.total_time += duration;
if (duration < benchmark.min_time || benchmark.min_time == 0) {
benchmark.min_time = duration;
}
if (duration > benchmark.max_time) {
benchmark.max_time = duration;
}
}
benchmark.average_time = benchmark.total_time / iterations;
// Print results
printf("Benchmark: %s\n", benchmark.benchmark_name);
printf(" Iterations: %u\n", benchmark.iterations);
printf(" Average time: %u us\n", benchmark.average_time);
printf(" Min time: %u us\n", benchmark.min_time);
printf(" Max time: %u us\n", benchmark.max_time);
printf(" Total time: %u us\n", benchmark.total_time);
}
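A usage sketch comparing two of the loop implementations defined earlier (the wrapped functions take no arguments, as run_benchmark requires):

// Example: benchmark the unrolled loop against the straightforward version
static int bench_data[1024];

static void bench_unoptimized(void) { unoptimized_loop(bench_data, 1024); }
static void bench_optimized(void) { optimized_loop(bench_data, 1024); }

void run_loop_benchmarks(void) {
    run_benchmark("unoptimized_loop", bench_unoptimized, 1000);
    run_benchmark("optimized_loop", bench_optimized, 1000);
}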
# Using cppcheck for static analysis
cppcheck --enable=all --xml --xml-version=2 . 2> static_analysis.xml
# Using clang-tidy for performance-related checks (the trailing -- supplies compile flags)
clang-tidy --checks='-*,performance-*' source_file.c --
# Using gcc warnings
gcc -Wall -Wextra -Werror -O2 -o program program.c
# Using Valgrind for memory analysis
valgrind --leak-check=full --show-leak-kinds=all ./program
# Using AddressSanitizer
gcc -fsanitize=address -g -o program program.c
# Using perf for performance analysis
perf record ./program
perf report
# Using gprof for function profiling
gcc -pg -o program program.c
./program
gprof program gmon.out > profile.txt
# Using ftrace for kernel tracing
echo 1 > /sys/kernel/debug/tracing/tracing_on
./program
echo 0 > /sys/kernel/debug/tracing/tracing_on
cat /sys/kernel/debug/tracing/trace
Objective: Learn how loop unrolling can significantly improve performance by reducing loop overhead and enabling better compiler optimizations.
Steps:
1. Build the unoptimized_loop example and measure its execution time with the performance timer.
2. Apply manual unrolling by 4 (or enable -funroll-loops) and rebuild with the same optimization level.
3. Compare the measured times and, if possible, inspect the generated assembly.
Expected Outcome: A measurable reduction in execution time for the performance-critical loop.
Objective: Understand how data structures can impact performance and cache locality.
Steps:
1. Compare sizeof() for the unoptimized and optimized structure layouts on the target.
2. Replace ad-hoc malloc/free calls with the fixed-size memory pool allocator.
3. Benchmark row-major versus column-major traversal of a large two-dimensional array.
Expected Outcome: Reduced memory fragmentation, improved cache performance, and potentially lower power consumption.
Objective: Learn how to manage power consumption in embedded systems.
Steps:
1. Measure baseline current draw with the CPU busy-waiting in the main loop.
2. Introduce the power-efficient main loop with __WFI() and peripheral clock gating.
3. Add adaptive frequency scaling and measure current draw again under the same workload.
Expected Outcome: Reduced power consumption and potentially longer battery life.