The "Holy Bible" for embedded engineers
Using built-in functions for hardware-specific operations and optimizations
They promise the compiler a specific operation; when supported, you get a single instruction, otherwise a correct fallback. Guard for architecture, and always keep a portable path.
// Measure vs loop implementation
static inline uint32_t popcnt_loop(uint32_t v){ uint32_t c=0; while(v){c+=v&1u; v>>=1;} return c; }
static inline uint32_t popcnt_intrin(uint32_t v){ return __builtin_popcount(v); }
Compare the generated code at -O0 and -O2 on your target. Guard intrinsic use with #ifdef and provide a loop fallback to keep portability. As an exercise, implement count_bits three ways (loop, table, intrinsic), then benchmark and inspect code size; a table-driven sketch follows below. See Embedded_C/Assembly_Integration.md for when to drop to asm, and Embedded_C/Bit_Manipulation.md for POPCNT use cases.
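The exercise above mentions a table-driven variant; a minimal sketch of that third approach (names are illustrative) trades a 256-byte table for a branch-free lookup:
// Table-driven popcount: table[i] holds the bit count of byte value i.
static uint8_t popcnt_table[256];
static void popcnt_table_init(void) {
    for (int i = 0; i < 256; i++) {
        popcnt_table[i] = (uint8_t)((i & 1) + popcnt_table[i >> 1]);
    }
}
static inline uint32_t popcnt_table_lookup(uint32_t v) {
    return popcnt_table[v & 0xFF] +
           popcnt_table[(v >> 8) & 0xFF] +
           popcnt_table[(v >> 16) & 0xFF] +
           popcnt_table[(v >> 24) & 0xFF];
}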
Compiler intrinsics are built-in functions provided by the compiler that map directly to specific CPU instructions. They offer a high-level interface to low-level hardware operations, enabling developers to write optimized code without using assembly language.
They provide, in brief:
Hardware Abstraction: low-level operations behind an ordinary C function call, with no hand-written assembly.
Direct Instruction Mapping: each intrinsic corresponds to a specific instruction (or short sequence) when the target supports it.
Optimization Benefits: the compiler understands intrinsics, so it can still inline, schedule, and constant-fold around them.
C Code (High-level):
// Standard C code - compiler may optimize
uint32_t count_bits(uint32_t value) {
uint32_t count = 0;
while (value) {
count += value & 1;
value >>= 1;
}
return count;
}
Intrinsic (Optimized):
// Intrinsic - maps to specific CPU instruction
uint32_t count_bits_intrinsic(uint32_t value) {
// Maps to a target-specific instruction when available. ARM Cortex-M has no
// popcount instruction, so this becomes a short bit-twiddling sequence or a
// libgcc call; on x86 with -mpopcnt it compiles to a single POPCNT.
return __builtin_popcount(value);
}
Assembly (Low-level):
// Assembly - direct CPU instruction (x86-only; requires a CPU with POPCNT)
uint32_t count_bits_asm(uint32_t value) {
uint32_t result;
__asm__ volatile("popcnt %1, %0" : "=r"(result) : "r"(value) : "cc"); // POPCNT writes flags
return result;
}
Performance Critical Applications:
Hardware-Specific Operations:
Optimization Requirements:
Performance Improvements:
// Standard C implementation - slower
uint32_t count_bits_standard(uint32_t value) {
uint32_t count = 0;
for (int i = 0; i < 32; i++) {
if (value & (1 << i)) count++;
}
return count;
}
// Intrinsic implementation - typically much faster
uint32_t count_bits_intrinsic(uint32_t value) {
return __builtin_popcount(value); // one instruction when the target supports it
}
// Performance comparison (target-dependent)
// Standard: up to 32 iterations with conditional branches
// Intrinsic: one POPCNT instruction where available, otherwise an optimized library sequence
Hardware Feature Access:
// Access to hardware-specific features
// ARM interrupt control. Guard to avoid non-ARM builds failing. Note there is
// no generic GCC builtin for CPSIE/CPSID; inline asm (the same sequences CMSIS
// emits for __enable_irq/__disable_irq) is the reliable spelling.
#if defined(__arm__)
void enable_interrupts(void) {
    __asm__ volatile("cpsie i" ::: "memory"); // CMSIS: __enable_irq()
}
void disable_interrupts(void) {
    __asm__ volatile("cpsid i" ::: "memory"); // CMSIS: __disable_irq()
}
// Memory barriers for ordered I/O and SMP (on MCUs without SMP, still useful for I/O ordering)
void memory_barrier(void) {
    __builtin_arm_dmb(0xF); // DMB SY: full-system barrier (ACLE: __dmb(0xF))
}
#endif
Cross-platform Compatibility:
// Platform-independent intrinsic usage
uint32_t count_bits_platform_independent(uint32_t value) {
#ifdef __GNUC__
return __builtin_popcount(value);
#elif defined(_MSC_VER)
return __popcnt(value);
#else
// Fallback implementation
uint32_t count = 0;
while (value) {
count += value & 1;
value >>= 1;
}
return count;
#endif
}
High Impact Scenarios:
Low Impact Scenarios:
Compiler Processing:
Instruction Selection:
Performance Characteristics:
Bit Manipulation:
Memory Operations:
Mathematical Operations:
Hardware Control:
GCC/Clang Support:
MSVC Support:
Cross-platform Strategies:
GCC intrinsics are built-in functions provided by the GNU Compiler Collection that offer direct access to CPU instructions and hardware features. They provide a high-level interface to low-level operations.
Built-in Functions:
Instruction Mapping:
// Population count (count set bits)
uint32_t count_bits_gcc(uint32_t value) {
return __builtin_popcount(value);
}
uint32_t count_bits_gcc_ll(uint64_t value) {
return __builtin_popcountll(value);
}
// Find first set bit (count trailing zeros)
uint32_t find_first_set_bit_gcc(uint32_t value) {
if (value == 0) return 32;
return __builtin_ctz(value);
}
uint32_t find_first_set_bit_gcc_ll(uint64_t value) {
if (value == 0) return 64;
return __builtin_ctzll(value);
}
// Find last set bit (count leading zeros)
uint32_t find_last_set_bit_gcc(uint32_t value) {
if (value == 0) return 32;
return 31 - __builtin_clz(value);
}
uint32_t find_last_set_bit_gcc_ll(uint64_t value) {
if (value == 0) return 64;
return 63 - __builtin_clzll(value);
}
// Check for overflow in arithmetic operations
bool add_overflow_check(uint32_t a, uint32_t b, uint32_t* result) {
return __builtin_add_overflow(a, b, result);
}
bool sub_overflow_check(uint32_t a, uint32_t b, uint32_t* result) {
return __builtin_sub_overflow(a, b, result);
}
bool mul_overflow_check(uint32_t a, uint32_t b, uint32_t* result) {
return __builtin_mul_overflow(a, b, result);
}
// Usage
uint32_t result;
if (add_overflow_check(0xFFFFFFFF, 1, &result)) {
// Overflow occurred
printf("Overflow detected!\n");
}
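One practical embedded use of the overflow builtins is saturating arithmetic; a minimal sketch (the function name is illustrative, UINT32_MAX comes from stdint.h):
// Saturating add: clamp to UINT32_MAX instead of wrapping on overflow.
uint32_t add_saturate_u32(uint32_t a, uint32_t b) {
    uint32_t result;
    if (__builtin_add_overflow(a, b, &result)) {
        return UINT32_MAX; // overflow: saturate
    }
    return result;
}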
// Byte order conversion intrinsics
uint16_t swap_bytes_16(uint16_t value) {
return __builtin_bswap16(value);
}
uint32_t swap_bytes_32(uint32_t value) {
return __builtin_bswap32(value);
}
uint64_t swap_bytes_64(uint64_t value) {
return __builtin_bswap64(value);
}
// Usage (assumes the host is little-endian; a guarded helper follows below)
uint32_t network_value = 0x12345678;
uint32_t host_value = __builtin_bswap32(network_value);
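A hedged helper that swaps only when host and wire byte order actually differ, using the GCC/Clang predefined __BYTE_ORDER__ macros (the helper name is illustrative):
// Convert host order to big-endian (network) order.
static inline uint32_t to_big_endian_32(uint32_t host_value) {
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
    return __builtin_bswap32(host_value); // little-endian host: swap
#else
    return host_value;                    // big-endian host: already in wire order
#endif
}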
// Type conversion: for scalars a plain cast is enough; the compiler emits the
// right conversion instruction (e.g., VCVT on ARM).
float int_to_float(int value) {
    return (float)value;
}
int float_to_int(float value) {
    return (int)value;
}
// Note: __builtin_convertvector converts *vector* types, not scalars.
// Usage
int int_value = 42;
float float_value = int_to_float(int_value);
// Memory barrier intrinsics (the argument selects the ACLE barrier option)
void full_memory_barrier(void) {
    __builtin_arm_dmb(0xF); // DMB SY: full system, loads and stores
}
void data_memory_barrier(void) {
    __builtin_arm_dmb(0xF); // DMB SY (0xE would be DMB ST: stores only)
}
void instruction_memory_barrier(void) {
    __builtin_arm_isb(0xF); // ISB SY: instruction synchronization barrier
}
// Usage in multi-core systems
void atomic_operation(void) {
// Perform atomic operation
atomic_value = new_value;
// Ensure memory ordering
data_memory_barrier();
}
// Cache operations: GCC/Clang provide no generic cache-maintenance builtins.
// On Cortex-M7, CMSIS (core_cm7.h) provides by-address helpers; on Cortex-A
// the equivalent is a CP15 DCCMVAC/DCIMVAC loop per cache line in inline asm.
void cache_clean(void* address, size_t size) {
    SCB_CleanDCache_by_Addr((uint32_t*)address, (int32_t)size);           // write back to memory
}
void cache_invalidate(void* address, size_t size) {
    SCB_InvalidateDCache_by_Addr((uint32_t*)address, (int32_t)size);      // discard cached copies
}
void cache_clean_and_invalidate(void* address, size_t size) {
    SCB_CleanInvalidateDCache_by_Addr((uint32_t*)address, (int32_t)size); // write back, then discard
}
// Usage for DMA operations
void prepare_dma_buffer(void* buffer, size_t size) {
    // Clean the cache so the DMA engine sees the data the CPU just wrote
    cache_clean(buffer, size);
}
ARM intrinsics are built-in functions specifically designed for ARM processors that provide access to ARM-specific instructions and features. They offer optimized implementations for ARM architectures.
ARM-specific Features:
Instruction Sets:
// System control: interrupt and fault masking via the CPS instruction
// (no generic builtin exists; this inline asm matches what CMSIS emits)
void enable_interrupts_arm(void) {
    __asm__ volatile("cpsie i" ::: "memory"); // clear PRIMASK
}
void disable_interrupts_arm(void) {
    __asm__ volatile("cpsid i" ::: "memory"); // set PRIMASK
}
void enable_faults_arm(void) {
    __asm__ volatile("cpsie f" ::: "memory"); // clear FAULTMASK
}
void disable_faults_arm(void) {
    __asm__ volatile("cpsid f" ::: "memory"); // set FAULTMASK
}
// Usage
void critical_section(void) {
disable_interrupts_arm();
// Critical code here
enable_interrupts_arm();
}
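Note that the pair above unconditionally re-enables interrupts, which is wrong if critical sections nest; on Cortex-M the usual fix is to save and restore PRIMASK. A sketch, assuming GCC/Clang inline asm:
// Nesting-safe critical section: save PRIMASK, disable IRQs, restore on exit.
static inline uint32_t enter_critical(void) {
    uint32_t primask;
    __asm__ volatile("mrs %0, primask" : "=r"(primask));
    __asm__ volatile("cpsid i" ::: "memory");
    return primask;
}
static inline void exit_critical(uint32_t primask) {
    __asm__ volatile("msr primask, %0" :: "r"(primask) : "memory");
}
void critical_section_nested(void) {
    uint32_t state = enter_critical();
    // Critical code here
    exit_critical(state); // restores the previous state instead of blindly enabling
}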
// ARM-specific bit manipulation: use the generic builtins; on ARM,
// __builtin_clz maps to CLZ and __builtin_ctz to RBIT + CLZ
// (ARM has no dedicated CTZ or integer POPCNT instruction).
uint32_t count_leading_zeros_arm(uint32_t value) {
    return __builtin_clz(value);      // undefined for value == 0
}
uint32_t count_trailing_zeros_arm(uint32_t value) {
    return __builtin_ctz(value);      // undefined for value == 0
}
uint32_t population_count_arm(uint32_t value) {
    return __builtin_popcount(value); // NEON VCNT or a library call
}
// Usage
uint32_t value = 0x12345678;
uint32_t leading_zeros = count_leading_zeros_arm(value);
uint32_t trailing_zeros = count_trailing_zeros_arm(value);
uint32_t set_bits = population_count_arm(value);
// ARM memory operation intrinsics
void data_memory_barrier_arm(void) {
    __builtin_arm_dmb(0xF); // DMB SY: data memory barrier, full system
}
void instruction_sync_barrier_arm(void) {
    __builtin_arm_isb(0xF); // ISB SY: instruction synchronization barrier
}
void data_sync_barrier_arm(void) {
    __builtin_arm_dsb(0xF); // DSB SY: data synchronization barrier
}
// Usage for multi-core synchronization
void synchronize_cores(void) {
data_memory_barrier_arm();
instruction_sync_barrier_arm();
}
Bit manipulation intrinsics provide efficient implementations of common bit operations that map to specific CPU instructions. They offer significant performance improvements over standard C implementations.
Common Operations:
Performance Benefits:
// Population count - count set bits
uint32_t popcount_standard(uint32_t value) {
uint32_t count = 0;
while (value) {
count += value & 1;
value >>= 1;
}
return count;
}
uint32_t popcount_intrinsic(uint32_t value) {
    return __builtin_popcount(value); // single instruction where supported
}
uint32_t popcount_64_intrinsic(uint64_t value) {
return __builtin_popcountll(value); // 64-bit version
}
// Usage
uint32_t test_value = 0x12345678;
uint32_t bit_count = popcount_intrinsic(test_value);
// Count leading zeros (find first set bit from MSB)
uint32_t clz_standard(uint32_t value) {
if (value == 0) return 32;
uint32_t count = 0;
while (!(value & 0x80000000)) {
count++;
value <<= 1;
}
return count;
}
uint32_t clz_intrinsic(uint32_t value) {
    if (value == 0) return 32;   // __builtin_clz(0) is undefined
    return __builtin_clz(value); // single CLZ instruction on ARM
}
// Count trailing zeros (find first set bit from LSB)
uint32_t ctz_standard(uint32_t value) {
if (value == 0) return 32;
uint32_t count = 0;
while (!(value & 1)) {
count++;
value >>= 1;
}
return count;
}
uint32_t ctz_intrinsic(uint32_t value) {
    if (value == 0) return 32;   // __builtin_ctz(0) is undefined
    return __builtin_ctz(value); // RBIT + CLZ on ARM
}
// Usage
uint32_t value = 0x00080000; // Bit 19 set
uint32_t leading_zeros = clz_intrinsic(value); // 12
uint32_t trailing_zeros = ctz_intrinsic(value); // 19
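A classic use of CTZ in interrupt handling is walking the set bits of a pending-flags word; a short sketch (the handler callback is illustrative):
// Dispatch each set bit in a pending-flags word, lowest bit first.
void dispatch_pending(uint32_t pending, void (*handler)(uint32_t bit)) {
    while (pending) {
        uint32_t bit = __builtin_ctz(pending); // index of lowest set bit (pending != 0 here)
        handler(bit);
        pending &= pending - 1;                // clear that lowest set bit
    }
}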
// Bit reversal - reverse bit order
uint32_t bit_reverse_standard(uint32_t value) {
uint32_t result = 0;
for (int i = 0; i < 32; i++) {
if (value & (1 << i)) {
result |= (1 << (31 - i));
}
}
return result;
}
uint32_t bit_reverse_intrinsic(uint32_t value) {
    return __builtin_bitreverse32(value); // Clang builtin; maps to RBIT on ARM
}
// Usage
uint32_t original = 0x12345678;
uint32_t reversed = bit_reverse_intrinsic(original);
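Since __builtin_bitreverse32 is a Clang extension with no GCC equivalent, a guarded version with a mask-and-shift fallback keeps the code portable; a sketch:
// Bit reversal with a portable fallback for compilers without the builtin.
uint32_t bit_reverse_portable(uint32_t v) {
#if defined(__clang__)
    return __builtin_bitreverse32(v); // RBIT on ARM
#else
    v = ((v >> 1) & 0x55555555u) | ((v & 0x55555555u) << 1); // swap adjacent bits
    v = ((v >> 2) & 0x33333333u) | ((v & 0x33333333u) << 2); // swap bit pairs
    v = ((v >> 4) & 0x0F0F0F0Fu) | ((v & 0x0F0F0F0Fu) << 4); // swap nibbles
    return __builtin_bswap32(v);                             // finally swap bytes
#endif
}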
Memory barrier intrinsics provide control over memory access ordering in multi-core and multi-threaded systems. They ensure proper synchronization and prevent memory ordering issues.
Memory Ordering:
Barrier Types:
// ARM memory barrier intrinsics (the argument is the ACLE barrier option)
void full_memory_barrier_arm(void) {
    __builtin_arm_dmb(0xF); // DMB SY: full-system memory barrier
}
void data_memory_barrier_arm(void) {
    __builtin_arm_dmb(0xF); // DMB SY (0xE = DMB ST orders stores only)
}
void instruction_sync_barrier_arm(void) {
    __builtin_arm_isb(0xF); // ISB SY
}
void data_sync_barrier_arm(void) {
    __builtin_arm_dsb(0xF); // DSB SY: completes before the next instruction executes
}
// Usage in multi-core systems
void atomic_operation_arm(void) {
// Perform atomic operation
atomic_value = new_value;
// Ensure memory ordering
data_memory_barrier_arm();
}
// GCC memory barrier intrinsics (portable across targets)
void full_memory_barrier_gcc(void) {
    __sync_synchronize(); // full memory barrier
}
void load_memory_barrier_gcc(void) {
    __atomic_thread_fence(__ATOMIC_ACQUIRE); // acquire fence: orders later loads
}
void store_memory_barrier_gcc(void) {
    __atomic_thread_fence(__ATOMIC_RELEASE); // release fence: orders earlier stores
}
// Usage for thread synchronization (the two halves run on different threads)
void producer_thread(void) {
    shared_data = new_data;     // Thread 1: write the payload
    store_memory_barrier_gcc(); // then make it visible
}
void consumer_thread(void) {
    load_memory_barrier_gcc();  // Thread 2: fence before reading
    data = shared_data;
}
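The canonical pattern these fences enable is publishing data behind a flag: the producer writes the payload, issues a release fence, then sets the flag; the consumer checks the flag, issues an acquire fence, then reads the payload. A sketch (variable names are illustrative; strict C11 code would also use atomic loads and stores for the flag):
volatile uint32_t message;
volatile int message_ready;
// Producer: the payload must be visible before the flag.
void publish(uint32_t value) {
    message = value;
    __atomic_thread_fence(__ATOMIC_RELEASE); // order the store above before the flag
    message_ready = 1;
}
// Consumer: the flag must be observed before the payload is read.
int try_consume(uint32_t* out) {
    if (!message_ready) return 0;
    __atomic_thread_fence(__ATOMIC_ACQUIRE); // order the flag check before the load below
    *out = message;
    return 1;
}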
SIMD (Single Instruction, Multiple Data) intrinsics provide access to vector processing instructions that can operate on multiple data elements simultaneously. They offer significant performance improvements for data-parallel operations.
Vector Processing:
Performance Benefits:
// ARM NEON SIMD intrinsics
#include <arm_neon.h>
// Vector addition
uint32x4_t vector_add_neon(uint32x4_t a, uint32x4_t b) {
return vaddq_u32(a, b); // Add 4 32-bit elements
}
// Vector multiplication
uint32x4_t vector_mul_neon(uint32x4_t a, uint32x4_t b) {
return vmulq_u32(a, b); // Multiply 4 32-bit elements
}
// Vector load and store (assumes size is a multiple of 4; see the tail-handling sketch below)
void vector_operations_neon(uint32_t* data, size_t size) {
for (size_t i = 0; i < size; i += 4) {
// Load 4 elements
uint32x4_t vec = vld1q_u32(&data[i]);
// Process vector
vec = vaddq_u32(vec, vdupq_n_u32(1));
// Store 4 elements
vst1q_u32(&data[i], vec);
}
}
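Since the loop above assumes the element count is a multiple of 4, real code needs a scalar tail for the remainder; a sketch (assumes <arm_neon.h> as above):
// Process the bulk with NEON, then finish the remainder (< 4 elements) scalar.
void vector_increment_with_tail(uint32_t* data, size_t size) {
    size_t i = 0;
    for (; i + 4 <= size; i += 4) {
        uint32x4_t vec = vld1q_u32(&data[i]);
        vst1q_u32(&data[i], vaddq_u32(vec, vdupq_n_u32(1)));
    }
    for (; i < size; i++) { // scalar tail
        data[i] += 1;
    }
}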
// Cross-platform SIMD abstraction
#ifdef __ARM_NEON
#include <arm_neon.h>
#define VECTOR_ADD(a, b) vaddq_u32(a, b)
#define VECTOR_MUL(a, b) vmulq_u32(a, b)
#elif defined(__SSE4_1__)
#include <smmintrin.h>
#define VECTOR_ADD(a, b) _mm_add_epi32(a, b)   // SSE2
#define VECTOR_MUL(a, b) _mm_mullo_epi32(a, b) // SSE4.1; plain SSE2 has no 32-bit mullo
#else
// Fallback implementation
#define VECTOR_ADD(a, b) /* fallback implementation */
#define VECTOR_MUL(a, b) /* fallback implementation */
#endif
Intrinsic performance depends on several factors including hardware support, compiler optimization, and usage patterns.
Hardware Support:
Compiler Optimization:
Usage Patterns:
// Optimal intrinsic usage for performance
void optimized_bit_operations(uint32_t* data, size_t size) {
for (size_t i = 0; i < size; i++) {
// Use intrinsics for optimal performance
data[i] = __builtin_popcount(data[i]);
}
}
// Vectorized operations
void vectorized_operations(uint32_t* data, size_t size) {
#ifdef __ARM_NEON
for (size_t i = 0; i < size; i += 4) { // assumes size % 4 == 0
uint32x4_t vec = vld1q_u32(&data[i]);
vec = vaddq_u32(vec, vdupq_n_u32(1));
vst1q_u32(&data[i], vec);
}
#else
for (size_t i = 0; i < size; i++) {
data[i] += 1;
}
#endif
}
// Optimized memory access patterns
void optimized_memory_access(uint32_t* data, size_t size) {
// Ensure proper alignment
if ((uintptr_t)data % 16 == 0) {
// Aligned access - use vector operations
vectorized_operations(data, size);
} else {
// Unaligned access - use scalar operations
for (size_t i = 0; i < size; i++) {
data[i] = __builtin_popcount(data[i]);
}
}
}
Cross-platform compatibility ensures that code using intrinsics works across different architectures and compilers while maintaining optimal performance.
Feature Detection:
Abstraction Layers:
// Compile-time feature detection
#ifdef __GNUC__
#define HAS_POPCNT 1
#define POPCNT(x) __builtin_popcount(x)
#elif defined(_MSC_VER)
#define HAS_POPCNT 1
#define POPCNT(x) __popcnt(x)
#else
#define HAS_POPCNT 0
#define POPCNT(x) popcount_fallback(x)
#endif
// Runtime feature detection (x86: requires #include <cpuid.h> on GCC/Clang)
bool has_popcnt_instruction(void) {
#if defined(__x86_64__) || defined(__i386__)
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) return false;
    return (ecx & (1u << 23)) != 0; // CPUID.1:ECX bit 23 = POPCNT
#else
    return false;
#endif
}
// Platform-independent interface
typedef struct {
uint32_t (*popcount)(uint32_t);
uint32_t (*clz)(uint32_t);
uint32_t (*ctz)(uint32_t);
} intrinsic_interface_t;
// Platform-specific implementations
#ifdef __GNUC__
static uint32_t gcc_popcount(uint32_t value) {
return __builtin_popcount(value);
}
static uint32_t gcc_clz(uint32_t value) {
return __builtin_clz(value);
}
static uint32_t gcc_ctz(uint32_t value) {
return __builtin_ctz(value);
}
static const intrinsic_interface_t intrinsics = {
.popcount = gcc_popcount,
.clz = gcc_clz,
.ctz = gcc_ctz
};
#else
// Fallback implementations
static const intrinsic_interface_t intrinsics = {
.popcount = popcount_fallback,
.clz = clz_fallback,
.ctz = ctz_fallback
};
#endif
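Call sites then go through the table and never name a compiler-specific builtin directly; for example (the function name is illustrative):
// The (value | 1) guard avoids ctz(0), which is undefined for the GCC builtin.
uint32_t set_bits_plus_shift(uint32_t value) {
    return intrinsics.popcount(value) + intrinsics.ctz(value | 1);
}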
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
// Platform detection
#ifdef __GNUC__
#define COMPILER_GCC 1
#elif defined(_MSC_VER)
#define COMPILER_MSVC 1
#else
#define COMPILER_UNKNOWN 1
#endif
// Feature detection
#ifdef __ARM_NEON
#define HAS_NEON 1
#include <arm_neon.h>
#else
#define HAS_NEON 0
#endif
// Intrinsic definitions
#ifdef COMPILER_GCC
#define POPCNT(x) __builtin_popcount(x)
#define CLZ(x) __builtin_clz(x)
#define CTZ(x) __builtin_ctz(x)
#define BSWAP32(x) __builtin_bswap32(x)
#elif defined(COMPILER_MSVC)
#define POPCNT(x) __popcnt(x)  // requires a CPU with POPCNT
#define CLZ(x) __lzcnt(x)      // requires LZCNT; silently wrong on older CPUs
#define CTZ(x) _tzcnt_u32(x)   // requires BMI1
#define BSWAP32(x) _byteswap_ulong(x)
#else
// Fallback implementations
#define POPCNT(x) popcount_fallback(x)
#define CLZ(x) clz_fallback(x)
#define CTZ(x) ctz_fallback(x)
#define BSWAP32(x) bswap32_fallback(x)
#endif
// Fallback implementations
uint32_t popcount_fallback(uint32_t value) {
uint32_t count = 0;
while (value) {
count += value & 1;
value >>= 1;
}
return count;
}
uint32_t clz_fallback(uint32_t value) {
if (value == 0) return 32;
uint32_t count = 0;
while (!(value & 0x80000000)) {
count++;
value <<= 1;
}
return count;
}
uint32_t ctz_fallback(uint32_t value) {
if (value == 0) return 32;
uint32_t count = 0;
while (!(value & 1)) {
count++;
value >>= 1;
}
return count;
}
uint32_t bswap32_fallback(uint32_t value) {
return ((value & 0xFF000000) >> 24) |
((value & 0x00FF0000) >> 8) |
((value & 0x0000FF00) << 8) |
((value & 0x000000FF) << 24);
}
// ARM-specific helpers (inline asm: no generic builtin exists for CPSIE/CPSID)
#ifdef __arm__
void enable_interrupts_arm(void) {
    __asm__ volatile("cpsie i" ::: "memory");
}
void disable_interrupts_arm(void) {
    __asm__ volatile("cpsid i" ::: "memory");
}
void memory_barrier_arm(void) {
    __builtin_arm_dmb(0xF); // DMB SY
}
#else
void enable_interrupts_arm(void) {
// Platform-specific implementation
}
void disable_interrupts_arm(void) {
// Platform-specific implementation
}
void memory_barrier_arm(void) {
// Platform-specific implementation
}
#endif
// SIMD operations
#if HAS_NEON // HAS_NEON is always defined (as 0 or 1), so test its value, not its existence
void vector_add_neon(uint32_t* data, size_t size) {
for (size_t i = 0; i < size; i += 4) { // assumes size % 4 == 0
uint32x4_t vec = vld1q_u32(&data[i]);
vec = vaddq_u32(vec, vdupq_n_u32(1));
vst1q_u32(&data[i], vec);
}
}
#else
void vector_add_neon(uint32_t* data, size_t size) {
for (size_t i = 0; i < size; i++) {
data[i] += 1;
}
}
#endif
// Performance testing
void test_intrinsics(void) {
uint32_t test_value = 0x12345678;
printf("Testing intrinsics:\n");
printf("Value: 0x%08X\n", test_value);
printf("Population count: %u\n", POPCNT(test_value));
printf("Leading zeros: %u\n", CLZ(test_value));
printf("Trailing zeros: %u\n", CTZ(test_value));
printf("Byte swapped: 0x%08X\n", BSWAP32(test_value));
}
// Main function
int main(void) {
// Test intrinsics
test_intrinsics();
// Test vector operations
uint32_t data[16] = {0};
for (int i = 0; i < 16; i++) {
data[i] = i;
}
vector_add_neon(data, 16);
printf("Vector operations completed\n");
return 0;
}
Problem: Code not portable across platforms Solution: Use conditional compilation and feature detection
// ❌ Bad: Platform-specific code
uint32_t count_bits(uint32_t value) {
return __builtin_popcount(value); // GCC-specific
}
// ✅ Good: Platform-independent code
uint32_t count_bits(uint32_t value) {
#ifdef __GNUC__
return __builtin_popcount(value);
#elif defined(_MSC_VER)
return __popcnt(value);
#else
return popcount_fallback(value);
#endif
}
Problem: Using intrinsics without checking availability Solution: Implement proper feature detection
// ❌ Bad: No feature detection
void vector_operation(uint32_t* data, size_t size) {
// May fail on platforms without SIMD support
uint32x4_t vec = vld1q_u32(data);
}
// ✅ Good: Feature detection
void vector_operation(uint32_t* data, size_t size) {
#ifdef __ARM_NEON
    for (size_t i = 0; i + 4 <= size; i += 4) { // scalar tail omitted for brevity
        uint32x4_t vec = vld1q_u32(&data[i]);
        vst1q_u32(&data[i], vaddq_u32(vec, vdupq_n_u32(1)));
    }
#else
    // Fallback implementation
    for (size_t i = 0; i < size; i++) {
        data[i] += 1;
    }
#endif
}
Problem: Incorrect intrinsic usage Solution: Read documentation and test thoroughly
// ❌ Bad: Incorrect intrinsic usage
uint32_t count_bits(uint32_t value) {
return __builtin_popcount(&value); // Wrong: passing pointer
}
// ✅ Good: Correct intrinsic usage
uint32_t count_bits(uint32_t value) {
return __builtin_popcount(value); // Correct: passing value
}
Problem: Assuming intrinsics are always faster Solution: Profile and measure performance
// ❌ Bad: Assuming the builtin is always one fast instruction
uint32_t count_bits(uint32_t value) {
    return __builtin_popcount(value); // a library call on targets without POPCNT
}
// ✅ Good: Measure on the real target before committing
uint32_t count_bits(uint32_t value) {
    // Shortcuts like these only pay off if such values dominate your data;
    // let profiling, not intuition, decide whether to keep them.
    if (value == 0) return 0;
    if (value == 0xFFFFFFFF) return 32;
    return __builtin_popcount(value);
}
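"Profile and measure" concretely: a minimal harness using standard clock(); on a bare-metal target, substitute a hardware cycle counter such as DWT->CYCCNT on Cortex-M:
#include <time.h>
// Rough timing harness: run one variant over many inputs and report CPU time.
// The volatile sink keeps the compiler from optimizing the work away.
static volatile uint32_t sink;
double time_count_bits(uint32_t (*fn)(uint32_t), uint32_t iterations) {
    clock_t start = clock();
    for (uint32_t i = 0; i < iterations; i++) {
        sink = fn(i);
    }
    return (double)(clock() - start) / CLOCKS_PER_SEC;
}
// e.g.: double t = time_count_bits(count_bits, 1000000u);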
Next Steps: Explore Assembly Integration to understand low-level programming techniques, or dive into Memory Models for understanding memory layout.