CX Framework
Cross-platform C utility framework
Loading...
Searching...
No Matches
simd.h
1#pragma once
2
3#include <cx/platform/base.h>
4
5#if defined(_ARCH_X86) || defined(_ARCH_X64)
6
// SSE2 intrinsics come from <emmintrin.h>, which all three supported compilers provide (presumably MSVC, GCC, and Clang - confirm).
8#include <emmintrin.h>
9#define _SIMD 1
10#define _SIMD_SSE2 1
11
12typedef __m128i simd128;
13
14#define CX_SIMD_SSE2 1
15
16_meta_inline simd128 simd128Set1_u8(uint8 val)
17{
18 return _mm_set1_epi8((char)val);
19}
20
21_meta_inline simd128 simd128Load(const void *ptr)
22{
23 return _mm_loadu_si128((const __m128i*)ptr);
24}
25
26_meta_inline uint32 simd128CmpEq_u8_mask(simd128 a, simd128 b)
27{
28 return (uint32)_mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
29}
30
31#elif defined(_ARCH_ARM64)
32
33#include <arm_neon.h>
34#define _SIMD 1
35#define _SIMD_NEON 1
36
37typedef uint8x16_t simd128;
38
39_meta_inline simd128 simd128Set1_u8(uint8 val)
40{
41 return vdupq_n_u8(val);
42}
43
44_meta_inline simd128 simd128Load(const void *ptr)
45{
46 return vld1q_u8((const uint8_t*)ptr);
47}
48
49_meta_inline uint32 simd128CmpEq_u8_mask(simd128 a, simd128 b)
50{
51 // ARM NEON workaround - expensive!
52 uint8x16_t cmp = vceqq_u8(a, b);
53
54 // Extract each byte's high bit and pack into 16-bit result
55 static const uint8_t bit_positions[16] = {
56 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7,
57 1<<0, 1<<1, 1<<2, 1<<3, 1<<4, 1<<5, 1<<6, 1<<7
58 };
59
60 uint8x16_t bits = vandq_u8(cmp, vld1q_u8(bit_positions));
61 uint8x8_t low = vget_low_u8(bits);
62 uint8x8_t high = vget_high_u8(bits);
63
64 return vaddv_u8(low) | ((uint32)vaddv_u8(high) << 8);
65}
66
67#endif
Compiler and platform detection macros.