#ifndef JEMALLOC_INTERNAL_TSD_H
#define JEMALLOC_INTERNAL_TSD_H

#include "jemalloc/internal/activity_callback.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/peak.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/rtree_tsd.h"
#include "jemalloc/internal/tcache_types.h"
#include "jemalloc/internal/tcache_structs.h"
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/witness.h"

/*
 * Thread-Specific-Data layout
 *
 * At least some thread-local data gets touched on the fast path of almost
 * every malloc operation. But much of it is only needed down slow paths, or
 * in testing. We want to colocate the fast-path data so that it can live on
 * the same cacheline if possible. So we define three tiers of hotness:
 * TSD_DATA_FAST: Touched on the alloc/dalloc fast paths.
 * TSD_DATA_SLOW: Touched down slow paths. "Slow" here is sort of general;
 * there are "semi-slow" paths like "not a sized deallocation, but can still
 * live in the tcache". We'll want to keep these closer to the fast-path
 * data.
 * TSD_DATA_SLOWER: Only touched in test or debug modes, or not touched at all.
 *
 * An additional concern is that the larger tcache bins won't be used (we have
 * a bin per size class, but by default only cache relatively small objects).
 * So the earlier bins are in the TSD_DATA_FAST tier, while the later ones are
 * in the TSD_DATA_SLOWER tier.
 *
 * As a result of all this, we put the slow data first, then the fast data,
 * then the slower data, while keeping the tcache as the last element of the
 * fast data (so that the fast -> slower transition happens midway through the
 * tcache). While we don't yet play alignment tricks to guarantee it, this
 * increases our odds of getting some cache/page locality on fast paths.
 */

#ifdef JEMALLOC_JET
typedef void (*test_callback_t)(int *);
# define MALLOC_TSD_TEST_DATA_INIT 0x72b65c10
# define MALLOC_TEST_TSD \
	O(test_data, int, int) \
	O(test_callback, test_callback_t, int)
# define MALLOC_TEST_TSD_INITIALIZER , MALLOC_TSD_TEST_DATA_INIT, NULL
#else
# define MALLOC_TEST_TSD
# define MALLOC_TEST_TSD_INITIALIZER
#endif
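
/*
 * Illustrative sketch (not part of the build): with JEMALLOC_JET defined,
 * every consumer of the O(name, type, nullable_type) x-macro sees the two
 * test fields appended to its list. For instance, a consumer that defines
 *	#define O(n, t, nt) t n;
 * would expand MALLOC_TEST_TSD to:
 *	int test_data;
 *	test_callback_t test_callback;
 */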

typedef ql_elm(tsd_t) tsd_link_t;

/* O(name, type, nullable type) */
#define TSD_DATA_SLOW \
	O(tcache_enabled, bool, bool) \
	O(reentrancy_level, int8_t, int8_t) \
	O(thread_allocated_last_event, uint64_t, uint64_t) \
	O(thread_allocated_next_event, uint64_t, uint64_t) \
	O(thread_deallocated_last_event, uint64_t, uint64_t) \
	O(thread_deallocated_next_event, uint64_t, uint64_t) \
	O(tcache_gc_event_wait, uint64_t, uint64_t) \
	O(tcache_gc_dalloc_event_wait, uint64_t, uint64_t) \
	O(prof_sample_event_wait, uint64_t, uint64_t) \
	O(prof_sample_last_event, uint64_t, uint64_t) \
	O(stats_interval_event_wait, uint64_t, uint64_t) \
	O(stats_interval_last_event, uint64_t, uint64_t) \
	O(peak_alloc_event_wait, uint64_t, uint64_t) \
	O(peak_dalloc_event_wait, uint64_t, uint64_t) \
	O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \
	O(prng_state, uint64_t, uint64_t) \
	O(san_extents_until_guard_small, uint64_t, uint64_t) \
	O(san_extents_until_guard_large, uint64_t, uint64_t) \
	O(iarena, arena_t *, arena_t *) \
	O(arena, arena_t *, arena_t *) \
	O(arena_decay_ticker, ticker_geom_t, ticker_geom_t) \
	O(sec_shard, uint8_t, uint8_t) \
	O(binshards, tsd_binshards_t, tsd_binshards_t) \
	O(tsd_link, tsd_link_t, tsd_link_t) \
	O(in_hook, bool, bool) \
	O(peak, peak_t, peak_t) \
	O(activity_callback_thunk, activity_callback_thunk_t, \
	    activity_callback_thunk_t) \
	O(tcache_slow, tcache_slow_t, tcache_slow_t) \
	O(rtree_ctx, rtree_ctx_t, rtree_ctx_t)

#define TSD_DATA_SLOW_INITIALIZER \
	/* tcache_enabled */ TCACHE_ENABLED_ZERO_INITIALIZER, \
	/* reentrancy_level */ 0, \
	/* thread_allocated_last_event */ 0, \
	/* thread_allocated_next_event */ 0, \
	/* thread_deallocated_last_event */ 0, \
	/* thread_deallocated_next_event */ 0, \
	/* tcache_gc_event_wait */ 0, \
	/* tcache_gc_dalloc_event_wait */ 0, \
	/* prof_sample_event_wait */ 0, \
	/* prof_sample_last_event */ 0, \
	/* stats_interval_event_wait */ 0, \
	/* stats_interval_last_event */ 0, \
	/* peak_alloc_event_wait */ 0, \
	/* peak_dalloc_event_wait */ 0, \
	/* prof_tdata */ NULL, \
	/* prng_state */ 0, \
	/* san_extents_until_guard_small */ 0, \
	/* san_extents_until_guard_large */ 0, \
	/* iarena */ NULL, \
	/* arena */ NULL, \
	/* arena_decay_ticker */ \
	    TICKER_GEOM_INIT(ARENA_DECAY_NTICKS_PER_UPDATE), \
	/* sec_shard */ (uint8_t)-1, \
	/* binshards */ TSD_BINSHARDS_ZERO_INITIALIZER, \
	/* tsd_link */ {NULL}, \
	/* in_hook */ false, \
	/* peak */ PEAK_INITIALIZER, \
	/* activity_callback_thunk */ \
	    ACTIVITY_CALLBACK_THUNK_INITIALIZER, \
	/* tcache_slow */ TCACHE_SLOW_ZERO_INITIALIZER, \
	/* rtree_ctx */ RTREE_CTX_INITIALIZER,

/* O(name, type, nullable type) */
#define TSD_DATA_FAST \
	O(thread_allocated, uint64_t, uint64_t) \
	O(thread_allocated_next_event_fast, uint64_t, uint64_t) \
	O(thread_deallocated, uint64_t, uint64_t) \
	O(thread_deallocated_next_event_fast, uint64_t, uint64_t) \
	O(tcache, tcache_t, tcache_t)

#define TSD_DATA_FAST_INITIALIZER \
	/* thread_allocated */ 0, \
	/* thread_allocated_next_event_fast */ 0, \
	/* thread_deallocated */ 0, \
	/* thread_deallocated_next_event_fast */ 0, \
	/* tcache */ TCACHE_ZERO_INITIALIZER,

/* O(name, type, nullable type) */
#define TSD_DATA_SLOWER \
	O(witness_tsd, witness_tsd_t, witness_tsdn_t) \
	MALLOC_TEST_TSD

#define TSD_DATA_SLOWER_INITIALIZER \
	/* witness */ WITNESS_TSD_INITIALIZER \
	/* test data */ MALLOC_TEST_TSD_INITIALIZER

#define TSD_INITIALIZER { \
	TSD_DATA_SLOW_INITIALIZER \
	/* state */ ATOMIC_INIT(tsd_state_uninitialized), \
	TSD_DATA_FAST_INITIALIZER \
	TSD_DATA_SLOWER_INITIALIZER \
}
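
/*
 * Example (sketch): the platform-specific backends included further below use
 * this initializer for the per-thread tsd object, along the lines of:
 *	JEMALLOC_TSD_TYPE_ATTR(tsd_t) tsd_tls = TSD_INITIALIZER;
 * The exact declaration and attribute wrapper vary by backend.
 */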

#if defined(JEMALLOC_MALLOC_THREAD_CLEANUP) || defined(_WIN32)
void _malloc_tsd_cleanup_register(bool (*f)(void));
#endif

void *malloc_tsd_malloc(size_t size);
void malloc_tsd_dalloc(void *wrapper);
tsd_t *malloc_tsd_boot0(void);
void malloc_tsd_boot1(void);
void tsd_cleanup(void *arg);
tsd_t *tsd_fetch_slow(tsd_t *tsd, bool minimal);
void tsd_state_set(tsd_t *tsd, uint8_t new_state);
void tsd_slow_update(tsd_t *tsd);
void tsd_prefork(tsd_t *tsd);
void tsd_postfork_parent(tsd_t *tsd);
void tsd_postfork_child(tsd_t *tsd);

/*
 * Call ..._inc when your module wants to take all threads down the slow paths,
 * and ..._dec when it no longer needs to.
 */
void tsd_global_slow_inc(tsdn_t *tsdn);
void tsd_global_slow_dec(tsdn_t *tsdn);
bool tsd_global_slow(void);
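
/*
 * Example (sketch): given a tsdn, a module flipping some global mode would
 * bracket the change so that every thread rechecks its fast/slow status:
 *
 *	tsd_global_slow_inc(tsdn);
 *	... install the module's new global state ...
 *	tsd_global_slow_dec(tsdn);
 */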

enum {
	/* Common case --> jnz. */
	tsd_state_nominal = 0,
	/* Initialized but on slow path. */
	tsd_state_nominal_slow = 1,
	/*
	 * Some thread has changed global state in such a way that all nominal
	 * threads need to recompute their fast / slow status the next time
	 * they get a chance.
	 *
	 * Any thread can change another thread's status *to* recompute, but
	 * threads are the only ones who can change their status *from*
	 * recompute.
	 */
	tsd_state_nominal_recompute = 2,
	/*
	 * The above nominal states should be lower values. We use
	 * tsd_nominal_max to separate nominal states from threads in the
	 * process of being born / dying.
	 */
	tsd_state_nominal_max = 2,

	/*
	 * A thread might free() during its death as its only allocator action;
	 * in such scenarios, we need tsd, but set it up in such a way that no
	 * cleanup is necessary.
	 */
	tsd_state_minimal_initialized = 3,
	/* States during which we know we're in thread death. */
	tsd_state_purgatory = 4,
	tsd_state_reincarnated = 5,
	/*
	 * What it says on the tin: tsd that hasn't been initialized. Note
	 * that even when the tsd struct lives in TLS, we need to keep track
	 * of things like whether or not our pthread destructors have been
	 * scheduled, so this really truly is different from the nominal state.
	 */
	tsd_state_uninitialized = 6
};

/*
 * Some TSD accesses can only be done in a nominal state. To enforce this, we
 * wrap TSD member access in a function that asserts on TSD state, and mangle
 * field names to prevent touching them accidentally.
 */
#define TSD_MANGLE(n) cant_access_tsd_items_directly_use_a_getter_or_setter_##n
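
/*
 * For example, TSD_MANGLE(tcache) token-pastes to
 * cant_access_tsd_items_directly_use_a_getter_or_setter_tcache, so a direct
 * access like tsd->tcache fails to compile; callers must go through the
 * generated accessors below (e.g. tsd_tcachep_get()).
 */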

#ifdef JEMALLOC_U8_ATOMICS
# define tsd_state_t atomic_u8_t
# define tsd_atomic_load atomic_load_u8
# define tsd_atomic_store atomic_store_u8
# define tsd_atomic_exchange atomic_exchange_u8
#else
# define tsd_state_t atomic_u32_t
# define tsd_atomic_load atomic_load_u32
# define tsd_atomic_store atomic_store_u32
# define tsd_atomic_exchange atomic_exchange_u32
#endif

/* The actual tsd. */
struct tsd_s {
	/*
	 * The contents should be treated as totally opaque outside the tsd
	 * module. Access any thread-local state through the getters and
	 * setters below.
	 */

#define O(n, t, nt) \
	t TSD_MANGLE(n);

	TSD_DATA_SLOW
	/*
	 * We manually limit the state to just a single byte, unless 8-bit
	 * atomics are unavailable (which is rare).
	 */
	tsd_state_t state;
	TSD_DATA_FAST
	TSD_DATA_SLOWER
#undef O
/* AddressSanitizer requires TLS data to be aligned to at least 8 bytes. */
} JEMALLOC_ALIGNED(16);
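
/*
 * Condensed sketch of what the x-macro expansion above produces (field names
 * abbreviated; the real names carry the full TSD_MANGLE prefix):
 *
 *	struct tsd_s {
 *		bool		...tcache_enabled;	(TSD_DATA_SLOW)
 *		...
 *		tsd_state_t	state;
 *		uint64_t	...thread_allocated;	(TSD_DATA_FAST)
 *		...
 *		tcache_t	...tcache;
 *		witness_tsd_t	...witness_tsd;		(TSD_DATA_SLOWER)
 *	} JEMALLOC_ALIGNED(16);
 */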

JEMALLOC_ALWAYS_INLINE uint8_t
tsd_state_get(tsd_t *tsd) {
	/*
	 * This should be atomic. Unfortunately, compilers right now can't
	 * tell that this can be done as a memory comparison, and force a load
	 * into a register that hurts fast-path performance.
	 */
	/* return atomic_load_u8(&tsd->state, ATOMIC_RELAXED); */
	return *(uint8_t *)&tsd->state;
}

/*
 * Wrapper around tsd_t that makes it possible to avoid implicit conversion
 * between tsd_t and tsdn_t, where tsdn_t is "nullable" and has to be
 * explicitly converted to tsd_t, which is non-nullable.
 */
struct tsdn_s {
	tsd_t tsd;
};
#define TSDN_NULL ((tsdn_t *)0)
JEMALLOC_ALWAYS_INLINE tsdn_t *
tsd_tsdn(tsd_t *tsd) {
	return (tsdn_t *)tsd;
}

JEMALLOC_ALWAYS_INLINE bool
tsdn_null(const tsdn_t *tsdn) {
	return tsdn == NULL;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsdn_tsd(tsdn_t *tsdn) {
	assert(!tsdn_null(tsdn));

	return &tsdn->tsd;
}
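
/*
 * Example usage (sketch; example_update is hypothetical): a function that
 * accepts a possibly-NULL tsdn and only upgrades to tsd when a thread context
 * is actually available:
 *
 *	static void
 *	example_update(tsdn_t *tsdn) {
 *		if (tsdn_null(tsdn)) {
 *			return;
 *		}
 *		tsd_t *tsd = tsdn_tsd(tsdn);
 *		...
 *	}
 */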

/*
 * We put the platform-specific data declarations and inlines into their own
 * header files to avoid cluttering this file. They define tsd_boot0,
 * tsd_boot1, tsd_boot, tsd_booted_get, tsd_get_allocates, tsd_get, and
 * tsd_set.
 */
#ifdef JEMALLOC_MALLOC_THREAD_CLEANUP
#include "jemalloc/internal/tsd_malloc_thread_cleanup.h"
#elif (defined(JEMALLOC_TLS))
#include "jemalloc/internal/tsd_tls.h"
#elif (defined(_WIN32))
#include "jemalloc/internal/tsd_win.h"
#else
#include "jemalloc/internal/tsd_generic.h"
#endif

/*
 * tsd_foop_get_unsafe(tsd) returns a pointer to the thread-local instance of
 * foo. This omits some safety checks, and so can be used during tsd
 * initialization and cleanup.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get_unsafe(tsd_t *tsd) { \
	return &tsd->TSD_MANGLE(n); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O
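
/*
 * For example, for the tcache field the macro above generates (sketch of the
 * expansion):
 *
 *	JEMALLOC_ALWAYS_INLINE tcache_t *
 *	tsd_tcachep_get_unsafe(tsd_t *tsd) {
 *		return &tsd->TSD_MANGLE(tcache);
 *	}
 */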

/* tsd_foop_get(tsd) returns a pointer to the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t * \
tsd_##n##p_get(tsd_t *tsd) { \
	/* \
	 * Because the state might change asynchronously if it's \
	 * nominal, we need to make sure that we only read it once. \
	 */ \
	uint8_t state = tsd_state_get(tsd); \
	assert(state == tsd_state_nominal || \
	    state == tsd_state_nominal_slow || \
	    state == tsd_state_nominal_recompute || \
	    state == tsd_state_reincarnated || \
	    state == tsd_state_minimal_initialized); \
	return tsd_##n##p_get_unsafe(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/*
 * tsdn_foop_get(tsdn) returns either the thread-local instance of foo (if tsdn
 * isn't NULL), or NULL (if tsdn is NULL), cast to the nullable pointer type.
 */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE nt * \
tsdn_##n##p_get(tsdn_t *tsdn) { \
	if (tsdn_null(tsdn)) { \
		return NULL; \
	} \
	tsd_t *tsd = tsdn_tsd(tsdn); \
	return (nt *)tsd_##n##p_get(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foo_get(tsd) returns the value of the thread-local instance of foo. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE t \
tsd_##n##_get(tsd_t *tsd) { \
	return *tsd_##n##p_get(tsd); \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O

/* tsd_foo_set(tsd, val) updates the thread-local instance of foo to be val. */
#define O(n, t, nt) \
JEMALLOC_ALWAYS_INLINE void \
tsd_##n##_set(tsd_t *tsd, t val) { \
	assert(tsd_state_get(tsd) != tsd_state_reincarnated && \
	    tsd_state_get(tsd) != tsd_state_minimal_initialized); \
	*tsd_##n##p_get(tsd) = val; \
}
TSD_DATA_SLOW
TSD_DATA_FAST
TSD_DATA_SLOWER
#undef O
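
/*
 * Example usage (sketch): reading and updating a field through the generated
 * accessors rather than by direct member access:
 *
 *	uint64_t prng = tsd_prng_state_get(tsd);
 *	tsd_prng_state_set(tsd, prng + 1);
 */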

JEMALLOC_ALWAYS_INLINE void
tsd_assert_fast(tsd_t *tsd) {
	/*
	 * Note that our fastness assertion does *not* include global slowness
	 * counters; it's not in general possible to ensure that they won't
	 * change asynchronously from underneath us.
	 */
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
}

JEMALLOC_ALWAYS_INLINE bool
tsd_fast(tsd_t *tsd) {
	bool fast = (tsd_state_get(tsd) == tsd_state_nominal);
	if (fast) {
		tsd_assert_fast(tsd);
	}

	return fast;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_impl(bool init, bool minimal) {
	tsd_t *tsd = tsd_get(init);

	if (!init && tsd_get_allocates() && tsd == NULL) {
		return NULL;
	}
	assert(tsd != NULL);

	if (unlikely(tsd_state_get(tsd) != tsd_state_nominal)) {
		return tsd_fetch_slow(tsd, minimal);
	}
	assert(tsd_fast(tsd));
	tsd_assert_fast(tsd);

	return tsd;
}

/* Get a minimal TSD that requires no cleanup. See comments in free(). */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch_min(void) {
	return tsd_fetch_impl(true, true);
}

/* For internal background thread use only. */
JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_internal_fetch(void) {
	tsd_t *tsd = tsd_fetch_min();
	/* Use reincarnated state to prevent full initialization. */
	tsd_state_set(tsd, tsd_state_reincarnated);

	return tsd;
}

JEMALLOC_ALWAYS_INLINE tsd_t *
tsd_fetch(void) {
	return tsd_fetch_impl(true, false);
}
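
/*
 * Example (sketch): a typical public entry point starts by materializing tsd
 * and branching on fastness:
 *
 *	tsd_t *tsd = tsd_fetch();
 *	if (tsd_fast(tsd)) {
 *		... fast-path allocation ...
 *	} else {
 *		... slow path ...
 *	}
 */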

static inline bool
tsd_nominal(tsd_t *tsd) {
	bool nominal = tsd_state_get(tsd) <= tsd_state_nominal_max;
	assert(nominal || tsd_reentrancy_level_get(tsd) > 0);

	return nominal;
}

JEMALLOC_ALWAYS_INLINE tsdn_t *
tsdn_fetch(void) {
	if (!tsd_booted_get()) {
		return NULL;
	}

	return tsd_tsdn(tsd_fetch_impl(false, false));
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsd_rtree_ctx(tsd_t *tsd) {
	return tsd_rtree_ctxp_get(tsd);
}

JEMALLOC_ALWAYS_INLINE rtree_ctx_t *
tsdn_rtree_ctx(tsdn_t *tsdn, rtree_ctx_t *fallback) {
	/*
	 * If tsd cannot be accessed, initialize the fallback rtree_ctx and
	 * return a pointer to it.
	 */
	if (unlikely(tsdn_null(tsdn))) {
		rtree_ctx_data_init(fallback);
		return fallback;
	}
	return tsd_rtree_ctx(tsdn_tsd(tsdn));
}
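
/*
 * Example usage (sketch): callers keep a stack-allocated fallback so the
 * lookup works even when no thread context is available:
 *
 *	rtree_ctx_t rtree_ctx_fallback;
 *	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback);
 */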

static inline bool
tsd_state_nocleanup(tsd_t *tsd) {
	return tsd_state_get(tsd) == tsd_state_reincarnated ||
	    tsd_state_get(tsd) == tsd_state_minimal_initialized;
}

/*
 * These "raw" tsd reentrancy functions don't have any debug checking to make
 * sure that we're not touching arena 0. It is better to call pre_reentrancy
 * and post_reentrancy when possible.
 */
static inline void
tsd_pre_reentrancy_raw(tsd_t *tsd) {
	bool fast = tsd_fast(tsd);
	assert(tsd_reentrancy_level_get(tsd) < INT8_MAX);
	++*tsd_reentrancy_levelp_get(tsd);
	if (fast) {
		/* Prepare slow path for reentrancy. */
		tsd_slow_update(tsd);
		assert(tsd_state_get(tsd) == tsd_state_nominal_slow);
	}
}

static inline void
tsd_post_reentrancy_raw(tsd_t *tsd) {
	int8_t *reentrancy_level = tsd_reentrancy_levelp_get(tsd);
	assert(*reentrancy_level > 0);
	if (--*reentrancy_level == 0) {
		tsd_slow_update(tsd);
	}
}
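
/*
 * Example (sketch; hook is hypothetical): bracketing a callback that may
 * itself call back into the allocator, so that reentrant allocations are
 * routed down the slow path:
 *
 *	tsd_pre_reentrancy_raw(tsd);
 *	hook();		// may call back into malloc
 *	tsd_post_reentrancy_raw(tsd);
 */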

#endif /* JEMALLOC_INTERNAL_TSD_H */