CX Framework
Cross-platform C utility framework
Loading...
Searching...
No Matches
aspin.h
1#pragma once
2
3#include <cx/math/lcg.h>
4#include <cx/thread/atomic.h>
5#include <cx/time/clock.h>
6#include <cx/time/time.h>
7#include <cx/platform/cpu.h>
8#include <cx/platform/os.h>
9#include <cx/utils/compare.h>
10
11// Common functions for adaptive spin threading primitives
12
#define ASPIN_MAX_USEC 10 // hard cap in microseconds (added to clockTimer() values -- presumably microsecond units; confirm against clockTimer)
#define ASPIN_INITIAL_TARGET 10 // initial number of cycles to spin
#define ASPIN_NOSPIN (-2147483647 - 1) // INT32_MIN sentinel: spinning disabled, always use the futex path

// Per-primitive acquisition statistics are collected only in debug builds.
#if DEBUG_LEVEL >= 2
#define ASPIN_PERF_STATS
#endif
20
// Shared adaptive-spin state embedded in a threading primitive.
// spintarget self-tunes over repeated acquisitions (see aspinAdapt).
typedef struct AdaptiveSpin
{
    atomic(int32) spintarget;          // adaptive spin-iteration target, or ASPIN_NOSPIN to disable spinning
#ifdef ASPIN_PERF_STATS
    atomic(intptr) stats_uncontended;  // uncontended acquisition of primitive
    atomic(intptr) stats_spin;         // times primitive was acquired after spinning
    atomic(intptr) stats_futex;        // times primitive was acquired on non-spin (futex) path
    atomic(intptr) stats_capped;       // times spin loop was capped by ASPIN_MAX_USEC
    atomic(intptr) stats_timeout;      // times the primitive failed due to timeout
    atomic(intptr) stats_yield;        // times the primitive yielded CPU to reduce contention
#endif
} AdaptiveSpin;
33
// Per-attempt spin state: initialized by aspinBegin and threaded through the
// other aspin* functions during a single acquisition of a primitive.
typedef struct AdaptiveSpinState
{
    int64 now;         // most recent clockTimer() sample (refreshed lazily in aspinSpin/aspinTimeout)
    int64 start;       // clockTimer() value captured by aspinBegin
    int64 endtime;     // absolute deadline (start + timeout), or timeForever for no timeout
    int64 spincap;     // absolute hard cap on spinning (start + ASPIN_MAX_USEC)
    int32 curtarget;   // snapshot of spintarget taken by aspinBegin
    int32 spincount;   // remaining spin iterations (starts at curtarget * 2, or 0 when nospin)
    int32 contention;  // consecutive failed-CAS count, see aspinHandleContention
    uint32 rstate;     // LCG PRNG state used for randomized yield decisions
} AdaptiveSpinState;
45
// Count an uncontended acquisition (no-op unless ASPIN_PERF_STATS builds).
_meta_inline void aspinRecordUncontended(_Inout_ AdaptiveSpin *aspin)
{
#ifdef ASPIN_PERF_STATS
    atomicFetchAdd(intptr, &aspin->stats_uncontended, 1, Relaxed);
#else
    (void)aspin;   // unused when stats are compiled out
#endif
}
54
// Count an acquisition that succeeded during the spin loop (stats builds only).
_meta_inline void aspinRecordSpin(_Inout_ AdaptiveSpin *aspin)
{
#ifdef ASPIN_PERF_STATS
    atomicFetchAdd(intptr, &aspin->stats_spin, 1, Relaxed);
#else
    (void)aspin;   // unused when stats are compiled out
#endif
}
63
// Count an acquisition that fell through to the futex path (stats builds only).
_meta_inline void aspinRecordFutex(_Inout_ AdaptiveSpin *aspin)
{
#ifdef ASPIN_PERF_STATS
    atomicFetchAdd(intptr, &aspin->stats_futex, 1, Relaxed);
#else
    (void)aspin;   // unused when stats are compiled out
#endif
}
72
// Count a spin loop cut short by the ASPIN_MAX_USEC hard cap (stats builds only).
_meta_inline void aspinRecordCapped(_Inout_ AdaptiveSpin *aspin)
{
#ifdef ASPIN_PERF_STATS
    atomicFetchAdd(intptr, &aspin->stats_capped, 1, Relaxed);
#else
    (void)aspin;   // unused when stats are compiled out
#endif
}
81
// Count an acquisition attempt that failed due to timeout (stats builds only).
_meta_inline void aspinRecordTimeout(_Inout_ AdaptiveSpin *aspin)
{
#ifdef ASPIN_PERF_STATS
    atomicFetchAdd(intptr, &aspin->stats_timeout, 1, Relaxed);
#else
    (void)aspin;   // unused when stats are compiled out
#endif
}
90
// Count a CPU yield issued to reduce CAS contention (stats builds only).
_meta_inline void aspinRecordYield(_Inout_ AdaptiveSpin *aspin)
{
#ifdef ASPIN_PERF_STATS
    atomicFetchAdd(intptr, &aspin->stats_yield, 1, Relaxed);
#else
    (void)aspin;   // unused when stats are compiled out
#endif
}
99
100_meta_inline void aspinInit(_Out_ AdaptiveSpin *aspin, bool nospin)
101{
102 memset(aspin, 0, sizeof(AdaptiveSpin));
103
104 // never spin if there's only a single core, just a waste of CPU
105 if (osPhysicalCPUs() == 1)
106 nospin = true;
107
108 atomicStore(int32, &aspin->spintarget, nospin ? ASPIN_NOSPIN : ASPIN_INITIAL_TARGET, Relaxed);
109}
110
111_meta_inline void aspinBegin(_Inout_ AdaptiveSpin *aspin, _Out_ AdaptiveSpinState *ass, int64 timeout)
112{
113 ass->now = clockTimer();
114 ass->start = ass->now;
115 ass->spincap = ass->start + ASPIN_MAX_USEC;
116 ass->endtime = (timeout == timeForever) ? timeForever : ass->start + timeout;
117 ass->curtarget = atomicLoad(int32, &aspin->spintarget, Relaxed);
118 if (ass->curtarget < 1 && ass->curtarget != ASPIN_NOSPIN)
119 ass->curtarget = ASPIN_INITIAL_TARGET;
120 ass->spincount = (ass->curtarget == ASPIN_NOSPIN) ? 0 : ass->curtarget * 2; // -1 == nospin
121 ass->contention = 0;
122 ass->rstate = (ass->now & 0xffffffff);
123}
124
125_meta_inline bool aspinSpin(_Inout_ AdaptiveSpin *aspin, _Inout_ AdaptiveSpinState *ass)
126{
127 // clockTimer may be an expensive system call on some platforms.
128 // Only update clock once every 8 loops, for a tighter spin and less latency.
129 if ((ass->spincount & 7) == 7)
130 ass->now = clockTimer();
131
132 // check if we hit the hard cap on spin time
133 if (ass->spincount > 0 && ass->now > ass->spincap) {
134 // this sets the target to half the current spincount, since the cap will be double that
135 atomicStore(int32, &aspin->spintarget, (ass->curtarget * 2 - ass->spincount) / 2, Relaxed);
136 ass->spincount = 0;
137 aspinRecordCapped(aspin);
138 }
139
140 if (ass->spincount > 0) {
141 --ass->spincount;
142 _CPU_PAUSE;
143 return true;
144 }
145
146 return false;
147}
148
149_meta_inline bool aspinTimeout(_Inout_ AdaptiveSpin *aspin, _Inout_ AdaptiveSpinState *ass)
150{
151 // early out if we don't have a timeout -- skip clock update
152 if (ass->endtime == timeForever)
153 return false;
154
155 // if spinloop is done, update clock here instead since the futex wait
156 // could be a very long time
157 if (ass->spincount == 0)
158 ass->now = clockTimer();
159
160 if (ass->now > ass->endtime) {
161 aspinRecordTimeout(aspin);
162 return true;
163 }
164 return false;
165}
166
167_meta_inline void aspinAdapt(_Inout_ AdaptiveSpin *aspin, _Inout_ AdaptiveSpinState *ass)
168{
169 // don't adapt if we timed out entirely
170 if (ass->now > ass->endtime)
171 return;
172
173 if (ass->curtarget != ASPIN_NOSPIN) {
174 if (ass->spincount > 0) {
175 // adjust adaptive target based on how much spinning we did
176 int32 realtarget = atomicLoad(int32, &aspin->spintarget, Relaxed);
177 atomicFetchAdd(int32, &aspin->spintarget, clamp(ass->curtarget - ass->spincount, -realtarget, realtarget) / 8 + 1, Relaxed);
178 aspinRecordSpin(aspin);
179 } else {
180 if (ass->now <= ass->spincap) {
181 // we had to go to futuxes, give it a boost
182 atomicFetchAdd(int32, &aspin->spintarget, ass->curtarget / 8 + 1, Relaxed);
183 }
184 aspinRecordFutex(aspin);
185 }
186 } else {
187 aspinRecordFutex(aspin);
188 }
189}
190
191_meta_inline int64 aspinTimeoutRemaining(_In_ AdaptiveSpinState *ass)
192{
193 return (ass->endtime == timeForever) ? timeForever : ass->endtime - ass->now;
194}
195
196// call this function when there is contention on a CAS
197_meta_inline void aspinHandleContention(_Inout_opt_ AdaptiveSpin *aspin, _Inout_ AdaptiveSpinState *ass)
198{
199 // This algorithm is cruicial to maintaining high performance even under extreme contention.
200 // Earlier versions of these primitives started to suffer degradation when many concurrent
201 // threads attempted to get the mutex/etc simultaneously. They would spend a lot of time
202 // spinning on a CAS on the underlying atomic, attempting to get into a state where they
203 // could wait with a futex/semaphore.
204
205 // By calling this function after a failed CAS, a count of failures is maintained. As the
206 // count increases, so does the probability that the thread will yield its quantum, reducing
207 // contention enough for another thread to succeed on the CAS and break the cycle.
208
209 // We use a poor quality but fast LCG pseudorandom number generator that is good enough
210 // for this purpose.
211
212 if (lcgRandom(&ass->rstate) % (++ass->contention + 7) > 8) {
213 if (aspin)
214 aspinRecordYield(aspin);
215 osYield();
216 } else {
217 for(int i = ass->contention * ass->contention; i >= 0; --i) {
218 _CPU_PAUSE;
219 }
220 }
221}
222
// Reset the contention counter after a successful CAS, ending the
// randomized-yield backoff cycle driven by aspinHandleContention.
_meta_inline void aspinEndContention(_Inout_ AdaptiveSpinState *ass)
{
    ass->contention = 0;
}
System clock functions.
Comparison and clamping macros.
CPU-specific operations and atomic primitives.
int32 lcgRandom(uint32 *state)
Definition lcg.h:68
int osPhysicalCPUs()
void osYield()
int64 clockTimer()
#define timeForever
Maximum representable time value (approximately year 294,276 CE)
Definition time.h:13
Simple linear congruential pseudo-random number generator.
Operating system services.
Time manipulation and conversion functions.