
Shared Memory Programming


🎯 Overview

Shared memory programming enables multiple cores or processes to communicate through common memory regions. In embedded systems, this is crucial for efficient inter-core communication, data sharing, and parallel processing while maintaining data consistency and avoiding race conditions.

🔧 Shared Memory Basics

Shared Memory Structure

// Headers required by the examples in this section
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

// Basic shared memory structure
typedef struct {
    volatile uint32_t flag;
    uint32_t data[100];
    volatile uint32_t read_index;
    volatile uint32_t write_index;
    uint32_t buffer_size;
} __attribute__((aligned(64))) shared_buffer_t;

// Shared memory region definition
typedef struct {
    shared_buffer_t* buffer;
    uint32_t core_id;
    bool is_initialized;
} shared_memory_context_t;

// Initialize shared memory
shared_memory_context_t* create_shared_memory_context(uint32_t core_id) {
    shared_memory_context_t* context = malloc(sizeof(shared_memory_context_t));
    if (!context) return NULL;
    
    // Allocate shared memory (in practice, this would be a specific memory region)
    context->buffer = (shared_buffer_t*)SHARED_MEMORY_BASE_ADDRESS;
    context->core_id = core_id;
    context->is_initialized = false;
    
    return context;
}

void initialize_shared_buffer(shared_buffer_t* buffer) {
    if (!buffer) return;
    
    buffer->flag = 0;
    buffer->read_index = 0;
    buffer->write_index = 0;
    buffer->buffer_size = sizeof(buffer->data) / sizeof(buffer->data[0]);
    
    // Clear data buffer
    memset(buffer->data, 0, sizeof(buffer->data));
}
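
A minimal usage sketch tying the two helpers together, assuming core 0 owns initialization and that SHARED_MEMORY_BASE_ADDRESS comes from the platform memory map or linker script:

// Usage sketch: core 0 initializes the region, then publishes readiness
void core0_startup(void) {
    shared_memory_context_t* ctx = create_shared_memory_context(0);
    if (!ctx) return;

    initialize_shared_buffer(ctx->buffer);
    ctx->is_initialized = true;

    ctx->buffer->flag = 1;  // Signal other cores that the buffer is ready
}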

Memory Region Mapping

// Memory region mapping for shared access
typedef struct {
    void* virtual_address;
    uint32_t physical_address;
    size_t size;
    uint32_t permissions;
    bool is_shared;
} memory_region_t;

memory_region_t* map_shared_memory_region(uint32_t physical_addr, 
                                         size_t size, 
                                         uint32_t permissions) {
    memory_region_t* region = malloc(sizeof(memory_region_t));
    if (!region) return NULL;
    
    // Map physical memory to virtual address
    region->physical_address = physical_addr;
    region->virtual_address = map_physical_memory(physical_addr, size, permissions);
    region->size = size;
    region->permissions = permissions;
    region->is_shared = true;
    
    return region;
}

void* map_physical_memory(uint32_t physical_addr, size_t size, uint32_t permissions) {
    // Platform-specific memory mapping
    // This would use MMU or memory mapping functions
    return (void*)physical_addr;  // Simplified for demonstration
}

void unmap_shared_memory_region(memory_region_t* region) {
    if (region && region->virtual_address) {
        unmap_physical_memory(region->virtual_address, region->size);
        free(region);
    }
}
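
On embedded Linux targets, the same idea is usually expressed through the POSIX shared memory API instead of a raw physical address. A minimal sketch (the object name "/my_shm" is illustrative; error handling is abbreviated):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

void* map_posix_shared_memory(size_t size) {
    int fd = shm_open("/my_shm", O_CREAT | O_RDWR, 0600);
    if (fd < 0) return NULL;

    if (ftruncate(fd, (off_t)size) != 0) {
        close(fd);
        return NULL;
    }

    void* addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                      MAP_SHARED, fd, 0);
    close(fd);  // The mapping remains valid after the fd is closed
    return (addr == MAP_FAILED) ? NULL : addr;
}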

🔒 Memory Synchronization

Mutex-based Synchronization

// Shared memory mutex
typedef struct {
    volatile uint32_t lock;
    uint32_t owner_core;
    uint32_t lock_count;
} __attribute__((aligned(64))) shared_mutex_t;

// Initialize shared mutex
void init_shared_mutex(shared_mutex_t* mutex) {
    mutex->lock = 0;
    mutex->owner_core = 0xFFFFFFFF;
    mutex->lock_count = 0;
}

// Acquire shared mutex (non-blocking try-lock; supports recursive
// acquisition by the owning core, matching the release logic below)
bool acquire_shared_mutex(shared_mutex_t* mutex, uint32_t core_id) {
    // Recursive case: the owning core may re-acquire safely
    if (mutex->lock && mutex->owner_core == core_id) {
        mutex->lock_count++;
        return true;
    }
    
    // Try to take the lock; the __sync CAS also acts as a full memory barrier
    if (__sync_bool_compare_and_swap(&mutex->lock, 0, 1)) {
        mutex->owner_core = core_id;
        mutex->lock_count = 1;
        return true;
    }
    
    return false;  // Held by another core
}

// Release shared mutex (owning core only)
void release_shared_mutex(shared_mutex_t* mutex, uint32_t core_id) {
    if (mutex->lock && mutex->owner_core == core_id) {
        mutex->lock_count--;
        if (mutex->lock_count == 0) {
            mutex->owner_core = 0xFFFFFFFF;
            __sync_lock_release(&mutex->lock);  // Release-store of 0
        }
    }
}

// Example usage with shared buffer
bool write_to_shared_buffer(shared_buffer_t* buffer, 
                          shared_mutex_t* mutex, 
                          uint32_t data, 
                          uint32_t core_id) {
    if (!acquire_shared_mutex(mutex, core_id)) {
        return false;
    }
    
    // Write data to shared buffer
    if (buffer->write_index < buffer->buffer_size) {
        buffer->data[buffer->write_index] = data;
        buffer->write_index++;
        release_shared_mutex(mutex, core_id);
        return true;
    }
    
    release_shared_mutex(mutex, core_id);
    return false;
}
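
Because acquire_shared_mutex is a try-lock, callers that must eventually obtain the lock typically spin with a bound. A sketch; MAX_SPIN_COUNT is an illustrative tuning parameter:

// Bounded spin-wait built on the try-lock above
#define MAX_SPIN_COUNT 10000

bool acquire_shared_mutex_spin(shared_mutex_t* mutex, uint32_t core_id) {
    for (uint32_t i = 0; i < MAX_SPIN_COUNT; i++) {
        if (acquire_shared_mutex(mutex, core_id)) {
            return true;
        }
        // A relax/yield hint (e.g. ARM "yield") fits here on real hardware
    }
    return false;  // Timed out; the caller decides how to recover
}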

Semaphore-based Synchronization

// Shared semaphore structure
typedef struct {
    volatile uint32_t count;
    volatile uint32_t max_count;
    uint32_t waiting_cores;
} __attribute__((aligned(64))) shared_semaphore_t;

// Initialize shared semaphore
void init_shared_semaphore(shared_semaphore_t* sem, uint32_t initial_count, uint32_t max_count) {
    sem->count = initial_count;
    sem->max_count = max_count;
    sem->waiting_cores = 0;
}

// Wait on shared semaphore
bool wait_shared_semaphore(shared_semaphore_t* sem, uint32_t core_id) {
    (void)core_id;  // Reserved for per-core wait-queue bookkeeping
    uint32_t expected;
    uint32_t desired;
    
    do {
        expected = sem->count;
        if (expected == 0) {
            return false;  // Semaphore not available
        }
        desired = expected - 1;
    } while (!__sync_bool_compare_and_swap(&sem->count, expected, desired));
    
    return true;
}

// Signal shared semaphore
bool signal_shared_semaphore(shared_semaphore_t* sem) {
    uint32_t expected;
    uint32_t desired;
    
    do {
        expected = sem->count;
        if (expected >= sem->max_count) {
            return false;  // Semaphore at maximum
        }
        desired = expected + 1;
    } while (!__sync_bool_compare_and_swap(&sem->count, expected, desired));
    
    return true;
}
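
A typical use is gating access to a fixed pool of shared resources, such as DMA buffers. A sketch assuming the helpers above; NUM_SHARED_BUFFERS is illustrative:

#define NUM_SHARED_BUFFERS 4

shared_semaphore_t buffer_pool_sem;

void buffer_pool_init(void) {
    init_shared_semaphore(&buffer_pool_sem, NUM_SHARED_BUFFERS, NUM_SHARED_BUFFERS);
}

bool use_pooled_buffer(uint32_t core_id) {
    if (!wait_shared_semaphore(&buffer_pool_sem, core_id)) {
        return false;  // No buffer free right now
    }
    // ... use one buffer from the pool ...
    return signal_shared_semaphore(&buffer_pool_sem);
}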

🔄 Lock-free Programming

Lock-free Queue Implementation

// Lock-free queue structure (safe for exactly one producer and one
// consumer; multi-producer/multi-consumer use needs CAS on head/tail)
typedef struct {
    volatile uint32_t head;
    volatile uint32_t tail;
    uint32_t size;
    uint32_t* data;
} __attribute__((aligned(64))) lock_free_queue_t;

lock_free_queue_t* create_lock_free_queue(uint32_t size) {
    lock_free_queue_t* queue = malloc(sizeof(lock_free_queue_t));
    if (!queue) return NULL;
    
    // C11 aligned_alloc requires the size to be a multiple of the alignment
    size_t bytes = ((size * sizeof(uint32_t) + 63) / 64) * 64;
    queue->data = aligned_alloc(64, bytes);
    if (!queue->data) {
        free(queue);
        return NULL;
    }
    
    queue->head = 0;
    queue->tail = 0;
    queue->size = size;
    
    return queue;
}

// Enqueue operation (single producer only)
bool lock_free_enqueue(lock_free_queue_t* queue, uint32_t value) {
    uint32_t tail = queue->tail;
    uint32_t next_tail = (tail + 1) % queue->size;
    
    if (next_tail == queue->head) {
        return false;  // Queue full
    }
    
    queue->data[tail] = value;
    __sync_synchronize();  // Publish the data before moving the tail
    queue->tail = next_tail;
    return true;
}

// Dequeue operation (single consumer only)
bool lock_free_dequeue(lock_free_queue_t* queue, uint32_t* value) {
    uint32_t head = queue->head;
    
    if (head == queue->tail) {
        return false;  // Queue empty
    }
    
    *value = queue->data[head];
    __sync_synchronize();  // Consume the slot before releasing it
    queue->head = (head + 1) % queue->size;
    return true;
}
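
The volatile-based version above depends on full barriers; with C11 <stdatomic.h>, the required ordering can be stated precisely and portably. A sketch of the same single-producer/single-consumer queue:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct {
    _Atomic uint32_t head;
    _Atomic uint32_t tail;
    uint32_t size;
    uint32_t* data;
} c11_spsc_queue_t;

bool c11_enqueue(c11_spsc_queue_t* q, uint32_t value) {
    uint32_t tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
    uint32_t next = (tail + 1) % q->size;
    if (next == atomic_load_explicit(&q->head, memory_order_acquire)) {
        return false;  // Full
    }
    q->data[tail] = value;
    // Release: the data write becomes visible before the new tail
    atomic_store_explicit(&q->tail, next, memory_order_release);
    return true;
}

bool c11_dequeue(c11_spsc_queue_t* q, uint32_t* value) {
    uint32_t head = atomic_load_explicit(&q->head, memory_order_relaxed);
    if (head == atomic_load_explicit(&q->tail, memory_order_acquire)) {
        return false;  // Empty
    }
    *value = q->data[head];
    atomic_store_explicit(&q->head, (head + 1) % q->size, memory_order_release);
    return true;
}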

Lock-free Stack Implementation

// Lock-free stack structure
typedef struct node {
    uint32_t data;
    struct node* next;
} lock_free_stack_node_t;

typedef struct {
    volatile lock_free_stack_node_t* top;
} lock_free_stack_t;

lock_free_stack_t* create_lock_free_stack(void) {
    lock_free_stack_t* stack = malloc(sizeof(lock_free_stack_t));
    if (stack) {
        stack->top = NULL;
    }
    return stack;
}

// Push operation
bool lock_free_push(lock_free_stack_t* stack, uint32_t value) {
    lock_free_stack_node_t* new_node = malloc(sizeof(lock_free_stack_node_t));
    if (!new_node) return false;
    
    new_node->data = value;
    
    lock_free_stack_node_t* old_top;
    do {
        old_top = stack->top;
        new_node->next = old_top;
    } while (!__sync_bool_compare_and_swap(&stack->top, old_top, new_node));
    
    return true;
}

// Pop operation
// NOTE: this classic CAS loop suffers from the ABA problem, and reading
// old_top->next can race with a concurrent free() of that node. Production
// code needs hazard pointers, epoch reclamation, or a pointer+counter CAS.
bool lock_free_pop(lock_free_stack_t* stack, uint32_t* value) {
    lock_free_stack_node_t* old_top;
    lock_free_stack_node_t* new_top;
    
    do {
        old_top = stack->top;
        if (!old_top) {
            return false;  // Stack empty
        }
        new_top = old_top->next;
    } while (!__sync_bool_compare_and_swap(&stack->top, old_top, new_top));
    
    *value = old_top->data;
    free(old_top);
    return true;
}

🚧 Memory Barriers

Memory Barrier Types

// Memory barrier functions
void full_memory_barrier(void) {
    __sync_synchronize();  // Full memory barrier
}

void read_memory_barrier(void) {
    __asm volatile ("dmb ishld" : : : "memory");  // Load barrier (ARMv8-A; use "dmb ish" on ARMv7)
}

void write_memory_barrier(void) {
    __asm volatile ("dmb ishst" : : : "memory");  // Store barrier (ARMv7-A/ARMv8-A)
}

// Compiler barrier
void compiler_barrier(void) {
    __asm volatile ("" : : : "memory");
}

// Memory barrier usage in shared memory
typedef struct {
    volatile uint32_t data;
    volatile uint32_t sequence;
} __attribute__((aligned(64))) barrier_protected_data_t;

void write_with_barrier(barrier_protected_data_t* shared_data, uint32_t new_data) {
    shared_data->data = new_data;
    write_memory_barrier();  // Ensure write is visible
    shared_data->sequence++;
    full_memory_barrier();   // Ensure sequence update is visible
}

uint32_t read_with_barrier(barrier_protected_data_t* shared_data) {
    uint32_t seq_before, seq_after, data;
    
    do {
        seq_before = shared_data->sequence;
        read_memory_barrier();   // Read the data only after the sequence
        data = shared_data->data;
        full_memory_barrier();   // Re-read the sequence after the data
        seq_after = shared_data->sequence;
    } while (seq_before != seq_after);  // Writer updated mid-read: retry
    
    return data;
}
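
On toolchains with C11 support, the same barriers can be written portably with atomic fences instead of inline assembly; the compiler then emits the right instruction (dmb, mfence, and so on) for the target. A sketch:

#include <stdatomic.h>

static inline void c11_full_barrier(void)     { atomic_thread_fence(memory_order_seq_cst); }
static inline void c11_read_barrier(void)     { atomic_thread_fence(memory_order_acquire); }
static inline void c11_write_barrier(void)    { atomic_thread_fence(memory_order_release); }
static inline void c11_compiler_barrier(void) { atomic_signal_fence(memory_order_seq_cst); }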

Atomic Operations with Barriers

// Atomic operations with memory barriers
uint32_t atomic_exchange_with_barrier(volatile uint32_t* ptr, uint32_t value) {
    uint32_t result = __sync_lock_test_and_set(ptr, value);
    full_memory_barrier();
    return result;
}

uint32_t atomic_compare_exchange_with_barrier(volatile uint32_t* ptr, 
                                             uint32_t expected, 
                                             uint32_t desired) {
    uint32_t result = __sync_val_compare_and_swap(ptr, expected, desired);
    full_memory_barrier();
    return result;
}

// Example: Atomic counter with barriers
typedef struct {
    volatile uint32_t counter;
    volatile uint32_t version;
} __attribute__((aligned(64))) atomic_counter_t;

void atomic_increment_with_barrier(atomic_counter_t* counter) {
    uint32_t old_value;
    
    // __sync_val_compare_and_swap returns the previous value; the CAS
    // succeeded only if that value equals the one we read
    do {
        old_value = counter->counter;
    } while (atomic_compare_exchange_with_barrier(&counter->counter,
                                                  old_value,
                                                  old_value + 1) != old_value);
    
    counter->version++;
    write_memory_barrier();
}

🔄 Cache Coherency

Cache Line Alignment

// Cache line aligned shared data
#define CACHE_LINE_SIZE 64

typedef struct {
    uint32_t core1_data;
    char padding1[CACHE_LINE_SIZE - 4];
    uint32_t core2_data;
    char padding2[CACHE_LINE_SIZE - 4];
    uint32_t core3_data;
    char padding3[CACHE_LINE_SIZE - 4];
    uint32_t core4_data;
    char padding4[CACHE_LINE_SIZE - 4];
} __attribute__((aligned(CACHE_LINE_SIZE))) cache_aligned_shared_data_t;

// Prevent false sharing
typedef struct {
    uint32_t frequently_accessed;
    char padding[CACHE_LINE_SIZE - 4];
} __attribute__((aligned(CACHE_LINE_SIZE))) isolated_shared_data_t;

// Cache-aware shared memory allocation
void* allocate_cache_aligned_shared_memory(size_t size) {
    size_t aligned_size = ((size + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE) * CACHE_LINE_SIZE;
    return aligned_alloc(CACHE_LINE_SIZE, aligned_size);
}
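
The layout intent can be checked at compile time so a later refactor cannot silently reintroduce false sharing; a sketch using C11 static_assert:

#include <assert.h>   // static_assert
#include <stddef.h>   // offsetof

// Verify that per-core fields land on separate cache lines
static_assert(offsetof(cache_aligned_shared_data_t, core2_data) >= CACHE_LINE_SIZE,
              "core1_data and core2_data share a cache line");
static_assert(sizeof(isolated_shared_data_t) == CACHE_LINE_SIZE,
              "isolated_shared_data_t must occupy exactly one cache line");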

Cache Flush and Invalidate

// Cache maintenance for shared memory.
// NOTE: __builtin___clear_cache only synchronizes the instruction cache
// with the data cache (for generated code); it is NOT a data-cache
// clean/invalidate. Real code uses platform operations (e.g. ARM
// DCCMVAC/DCIMVAC via CMSIS, or an OS cache API). The calls below are
// assumed platform-specific helpers, shown for illustration.
void flush_cache_for_shared_write(void* address, size_t size) {
    platform_dcache_clean(address, size);       // Clean (write back) D-cache
}

void invalidate_cache_for_shared_read(void* address, size_t size) {
    platform_dcache_invalidate(address, size);  // Invalidate D-cache
}

// Shared memory with cache management
typedef struct {
    void* data;
    size_t size;
    bool needs_flush;
    bool needs_invalidate;
} cache_managed_shared_memory_t;

cache_managed_shared_memory_t* create_cache_managed_shared_memory(size_t size) {
    cache_managed_shared_memory_t* shared_mem = malloc(sizeof(cache_managed_shared_memory_t));
    if (!shared_mem) return NULL;
    
    shared_mem->data = allocate_cache_aligned_shared_memory(size);
    if (!shared_mem->data) {
        free(shared_mem);
        return NULL;
    }
    shared_mem->size = size;
    shared_mem->needs_flush = true;
    shared_mem->needs_invalidate = true;
    
    return shared_mem;
}

void prepare_shared_memory_for_write(cache_managed_shared_memory_t* shared_mem) {
    if (shared_mem->needs_flush) {
        flush_cache_for_shared_write(shared_mem->data, shared_mem->size);
    }
}

void prepare_shared_memory_for_read(cache_managed_shared_memory_t* shared_mem) {
    if (shared_mem->needs_invalidate) {
        invalidate_cache_for_shared_read(shared_mem->data, shared_mem->size);
    }
}

🔄 Multi-core Communication

Core-to-Core Communication

// Inter-core communication structure
#define MAX_CORES 4  // Illustrative core count; set to match the platform
typedef struct {
    volatile uint32_t message;
    volatile uint32_t sender_core;
    volatile uint32_t receiver_core;
    volatile uint32_t sequence;
} __attribute__((aligned(64))) inter_core_message_t;

typedef struct {
    inter_core_message_t messages[MAX_CORES];
    volatile uint32_t message_count;
} inter_core_communication_t;

// Send message to another core
bool send_message_to_core(inter_core_communication_t* comm, 
                         uint32_t target_core, 
                         uint32_t message, 
                         uint32_t sender_core) {
    if (target_core >= MAX_CORES) return false;
    
    inter_core_message_t* msg = &comm->messages[target_core];
    msg->message = message;
    msg->sender_core = sender_core;
    msg->receiver_core = target_core;
    __sync_synchronize();  // Make the payload visible before signaling
    msg->sequence++;
    
    // Trigger interrupt on target core (platform-specific)
    trigger_core_interrupt(target_core);
    
    return true;
}

// Receive message from another core
bool receive_message_from_core(inter_core_communication_t* comm, 
                              uint32_t core_id, 
                              uint32_t* message, 
                              uint32_t* sender_core) {
    inter_core_message_t* msg = &comm->messages[core_id];
    
    // Message value 0 is reserved to mean "no message pending"
    if (msg->receiver_core == core_id && msg->message != 0) {
        *message = msg->message;
        *sender_core = msg->sender_core;
        msg->message = 0;  // Clear message
        return true;
    }
    
    return false;
}
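
On the receiving side, the mailbox is usually drained from the inter-core interrupt handler. A sketch; the IRQ entry point, get_current_core_id, and handle_inter_core_message are platform- or application-specific assumptions:

extern inter_core_communication_t g_comm;

void core_mailbox_irq_handler(void) {
    uint32_t message, sender;
    uint32_t my_core = get_current_core_id();  // Platform-specific, assumed

    // Drain all pending messages; keep ISR work short
    while (receive_message_from_core(&g_comm, my_core, &message, &sender)) {
        handle_inter_core_message(sender, message);  // Application-defined, assumed
    }
}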

Shared Memory Ring Buffer

// Ring buffer for inter-core communication
typedef struct {
    volatile uint32_t head;
    volatile uint32_t tail;
    volatile uint32_t size;
    volatile uint32_t* buffer;
} __attribute__((aligned(64))) shared_ring_buffer_t;

shared_ring_buffer_t* create_shared_ring_buffer(uint32_t size) {
    shared_ring_buffer_t* ring_buffer = malloc(sizeof(shared_ring_buffer_t));
    if (!ring_buffer) return NULL;
    
    ring_buffer->buffer = aligned_alloc(64, size * sizeof(uint32_t));
    if (!ring_buffer->buffer) {
        free(ring_buffer);
        return NULL;
    }
    
    ring_buffer->head = 0;
    ring_buffer->tail = 0;
    ring_buffer->size = size;
    
    return ring_buffer;
}

// Producer function (single producer core)
bool ring_buffer_produce(shared_ring_buffer_t* ring_buffer, uint32_t data) {
    uint32_t next_head = (ring_buffer->head + 1) % ring_buffer->size;
    
    if (next_head == ring_buffer->tail) {
        return false;  // Buffer full
    }
    
    ring_buffer->buffer[ring_buffer->head] = data;
    __sync_synchronize();  // Publish the data before the new head
    ring_buffer->head = next_head;
    return true;
}

// Consumer function (single consumer core)
bool ring_buffer_consume(shared_ring_buffer_t* ring_buffer, uint32_t* data) {
    if (ring_buffer->head == ring_buffer->tail) {
        return false;  // Buffer empty
    }
    
    *data = ring_buffer->buffer[ring_buffer->tail];
    __sync_synchronize();  // Consume the slot before releasing it
    ring_buffer->tail = (ring_buffer->tail + 1) % ring_buffer->size;
    return true;
}

⏱️ Real-time Considerations

Real-time Shared Memory

// Real-time shared memory with deadlines
typedef struct {
    volatile uint32_t data;
    volatile uint32_t timestamp;
    volatile uint32_t deadline;
    volatile uint32_t core_id;
} __attribute__((aligned(64))) real_time_shared_data_t;

// Real-time shared memory access
bool real_time_shared_write(real_time_shared_data_t* shared_data, 
                           uint32_t data, 
                           uint32_t deadline, 
                           uint32_t core_id) {
    uint32_t current_time = get_system_tick_count();
    
    // NOTE: a direct comparison misbehaves when the tick counter wraps;
    // see the wraparound-safe check sketched after this example
    if (current_time > deadline) {
        return false;  // Deadline missed
    }
    
    shared_data->data = data;
    shared_data->timestamp = current_time;
    shared_data->deadline = deadline;
    shared_data->core_id = core_id;
    
    return true;
}

bool real_time_shared_read(real_time_shared_data_t* shared_data, 
                          uint32_t* data, 
                          uint32_t* timestamp) {
    uint32_t current_time = get_system_tick_count();
    
    if (current_time > shared_data->deadline) {
        return false;  // Data expired
    }
    
    *data = shared_data->data;
    *timestamp = shared_data->timestamp;
    
    return true;
}
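
A direct current_time > deadline comparison misbehaves once the tick counter wraps around. The standard fix is to compare the signed difference, which stays correct as long as the deadline is less than half the counter range away. A sketch:

// Wraparound-safe deadline check (valid for deadlines within 2^31 ticks)
static inline bool deadline_passed(uint32_t now, uint32_t deadline) {
    return (int32_t)(now - deadline) > 0;
}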

Priority-based Shared Memory

// Priority-based shared memory access
typedef enum {
    PRIORITY_LOW = 0,
    PRIORITY_MEDIUM = 1,
    PRIORITY_HIGH = 2,
    PRIORITY_CRITICAL = 3
} shared_memory_priority_t;

typedef struct {
    volatile uint32_t data;
    volatile shared_memory_priority_t priority;
    volatile uint32_t access_count;
} __attribute__((aligned(64))) priority_shared_data_t;

bool priority_shared_write(priority_shared_data_t* shared_data, 
                          uint32_t data, 
                          shared_memory_priority_t priority) {
    // Check if we can preempt current access.
    // NOTE: this check-then-write is not atomic; two cores can race here.
    // A serialized variant is sketched below.
    if (priority > shared_data->priority) {
        shared_data->data = data;
        shared_data->priority = priority;
        shared_data->access_count++;
        return true;
    }
    
    return false;  // Lower priority access blocked
}
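
The check-then-write above is itself a race when two cores write concurrently. One simple fix is to serialize it with the shared mutex from earlier in this section; a sketch:

// Race-free variant: serialize the priority check and write
bool priority_shared_write_locked(priority_shared_data_t* shared_data,
                                  shared_mutex_t* mutex,
                                  uint32_t data,
                                  shared_memory_priority_t priority,
                                  uint32_t core_id) {
    if (!acquire_shared_mutex(mutex, core_id)) {
        return false;
    }

    bool written = false;
    if (priority > shared_data->priority) {
        shared_data->data = data;
        shared_data->priority = priority;
        shared_data->access_count++;
        written = true;
    }

    release_shared_mutex(mutex, core_id);
    return written;
}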

⚠️ Common Pitfalls

1. Race Conditions

// WRONG: Race condition in shared memory access
void incorrect_shared_access(volatile uint32_t* shared_counter) {
    uint32_t value = *shared_counter;  // Read
    value++;                           // Modify
    *shared_counter = value;           // Write
    // Race condition between read and write
}

// CORRECT: Atomic operation
void correct_shared_access(volatile uint32_t* shared_counter) {
    __sync_fetch_and_add(shared_counter, 1);  // Atomic increment
}

2. False Sharing

// WRONG: False sharing between cores
typedef struct {
    uint32_t core1_counter;  // May be in same cache line as core2_counter
    uint32_t core2_counter;
} incorrect_shared_counters_t;

// CORRECT: Cache line separation
typedef struct {
    uint32_t core1_counter;
    char padding1[60];  // Padding to next cache line
    uint32_t core2_counter;
    char padding2[60];
} __attribute__((aligned(64))) correct_shared_counters_t;

3. Missing Memory Barriers

// WRONG: Missing memory barriers
void incorrect_shared_write(volatile uint32_t* data, volatile uint32_t* flag) {
    *data = 0x12345678;
    *flag = 1;  // May be reordered with data write
}

// CORRECT: With memory barriers
void correct_shared_write(volatile uint32_t* data, volatile uint32_t* flag) {
    *data = 0x12345678;
    write_memory_barrier();  // Ensure data write is visible
    *flag = 1;
}

✅ Best Practices

1. Shared Memory Design Patterns

// Producer-consumer pattern with shared memory
typedef struct {
    volatile uint32_t* buffer;
    volatile uint32_t head;
    volatile uint32_t tail;
    volatile uint32_t size;
    shared_mutex_t* mutex;
} producer_consumer_shared_t;

producer_consumer_shared_t* create_producer_consumer_shared(uint32_t buffer_size) {
    producer_consumer_shared_t* pc = malloc(sizeof(producer_consumer_shared_t));
    if (!pc) return NULL;
    
    pc->buffer = aligned_alloc(64, buffer_size * sizeof(uint32_t));
    pc->head = 0;
    pc->tail = 0;
    pc->size = buffer_size;
    pc->mutex = malloc(sizeof(shared_mutex_t));
    
    if (pc->buffer && pc->mutex) {
        init_shared_mutex(pc->mutex);
        return pc;
    }
    
    // Cleanup on failure
    free(pc->buffer);
    free(pc->mutex);
    free(pc);
    return NULL;
}

bool producer_put(producer_consumer_shared_t* pc, uint32_t data, uint32_t core_id) {
    if (!acquire_shared_mutex(pc->mutex, core_id)) {
        return false;
    }
    
    uint32_t next_head = (pc->head + 1) % pc->size;
    if (next_head == pc->tail) {
        release_shared_mutex(pc->mutex, core_id);
        return false;  // Buffer full
    }
    
    pc->buffer[pc->head] = data;
    pc->head = next_head;
    
    release_shared_mutex(pc->mutex, core_id);
    return true;
}

bool consumer_get(producer_consumer_shared_t* pc, uint32_t* data, uint32_t core_id) {
    if (!acquire_shared_mutex(pc->mutex, core_id)) {
        return false;
    }
    
    if (pc->head == pc->tail) {
        release_shared_mutex(pc->mutex, core_id);
        return false;  // Buffer empty
    }
    
    *data = pc->buffer[pc->tail];
    pc->tail = (pc->tail + 1) % pc->size;
    
    release_shared_mutex(pc->mutex, core_id);
    return true;
}
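
Putting the pattern together, one core produces and another consumes. A sketch; read_sensor and process_sample are application-defined placeholders:

void core0_producer_task(producer_consumer_shared_t* pc) {
    uint32_t sample = read_sensor();      // Application-defined, assumed
    while (!producer_put(pc, sample, 0)) {
        // Buffer full or lock contended: retry, yield, or drop per policy
    }
}

void core1_consumer_task(producer_consumer_shared_t* pc) {
    uint32_t sample;
    if (consumer_get(pc, &sample, 1)) {
        process_sample(sample);           // Application-defined, assumed
    }
}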

2. Shared Memory Validation

// Validate shared memory access
bool validate_shared_memory_access(void* address, size_t size) {
    // Check alignment
    if ((uintptr_t)address % 64 != 0) {
        return false;
    }
    
    // Check size alignment
    if (size % 64 != 0) {
        return false;
    }
    
    // Check if address is in the shared memory region
    // (SHARED_MEMORY_BASE/END are platform-defined)
    if ((uintptr_t)address < SHARED_MEMORY_BASE || 
        (uintptr_t)address >= SHARED_MEMORY_END) {
        return false;
    }
    
    return true;
}

// Safe shared memory allocation
void* safe_shared_memory_alloc(size_t size) {
    size_t aligned_size = ((size + 63) / 64) * 64;  // Align to 64 bytes
    
    void* ptr = aligned_alloc(64, aligned_size);
    if (!ptr) return NULL;
    
    if (!validate_shared_memory_access(ptr, aligned_size)) {
        free(ptr);
        return NULL;
    }
    
    return ptr;
}

3. Shared Memory Monitoring

// Monitor shared memory usage
typedef struct {
    uint32_t access_count;
    uint32_t conflict_count;
    uint32_t last_access_time;
    uint32_t last_access_core;
} shared_memory_monitor_t;

shared_memory_monitor_t* create_shared_memory_monitor(void) {
    shared_memory_monitor_t* monitor = malloc(sizeof(shared_memory_monitor_t));
    if (monitor) {
        monitor->access_count = 0;
        monitor->conflict_count = 0;
        monitor->last_access_time = 0;
        monitor->last_access_core = 0;
    }
    return monitor;
}

void update_shared_memory_monitor(shared_memory_monitor_t* monitor, 
                                 uint32_t core_id, 
                                 bool conflict) {
    monitor->access_count++;
    monitor->last_access_time = get_system_tick_count();
    monitor->last_access_core = core_id;
    
    if (conflict) {
        monitor->conflict_count++;
    }
}

🎯 Interview Questions

Basic Questions

  1. What is shared memory programming and why is it important?
    • Enables multiple cores/processes to communicate through common memory
    • Important for efficient inter-core communication and data sharing
  2. What are the main challenges of shared memory programming?
    • Race conditions
    • Cache coherency issues
    • Memory ordering requirements
    • False sharing
  3. How do you prevent race conditions in shared memory?
    • Use atomic operations
    • Implement proper synchronization (mutexes, semaphores)
    • Use memory barriers
    • Design lock-free data structures

Advanced Questions

  1. How would you implement a lock-free data structure?
    • Use atomic compare-and-swap operations
    • Design for single-writer or multiple-reader scenarios
    • Implement proper memory ordering
  2. What are the trade-offs between lock-based and lock-free programming?
    • Lock-based: Simpler, but may cause contention
    • Lock-free: Better performance, but more complex
  3. How would you optimize shared memory for real-time systems?
    • Minimize access latency
    • Use appropriate memory barriers
    • Implement deadline-aware access patterns


Next Topic: Real-time Operating Systems