Skip to content

Commit 2e95c5b

Browse files
authored
gh-115999: Implement thread-local bytecode and enable specialization for BINARY_OP (#123926)
In free-threaded builds, each thread specializes a thread-local copy of the bytecode, created on the first RESUME. All copies of the bytecode for a code object are stored in the co_tlbc array on the code object. Each thread reserves a globally unique index that identifies its copy of the bytecode in every co_tlbc array; the index is reserved at thread creation and released at thread destruction. The first entry in every co_tlbc array always points to the "main" copy of the bytecode that is stored at the end of the code object. This ensures that no bytecode is copied for programs that do not use threads. Thread-local bytecode can be disabled at runtime by providing either -X tlbc=0 or PYTHON_TLBC=0. Disabling thread-local bytecode also disables specialization. Concurrent modifications to the bytecode made by the specializing interpreter and by instrumentation use atomics, with specialization taking care not to overwrite an instruction that was instrumented concurrently.
1 parent e5a4b40 commit 2e95c5b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+1509
-254
lines changed

Include/cpython/code.h

+19
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,24 @@ typedef struct {
7272
uint8_t *per_instruction_tools;
7373
} _PyCoMonitoringData;
7474

75+
#ifdef Py_GIL_DISABLED
76+
77+
/* Each thread specializes a thread-local copy of the bytecode in free-threaded
78+
* builds. These copies are stored on the code object in a `_PyCodeArray`. The
79+
* first entry in the array always points to the "main" copy of the bytecode
80+
* that is stored at the end of the code object.
81+
*/
82+
typedef struct {
83+
// Exclusive upper bound on valid indices into `entries`; readers compare an
// index against `size` before indexing.
Py_ssize_t size;
84+
// Bytecode copies, indexed by a thread's tlbc_index. An entry may be NULL
// when no copy exists for that index. Declared with length 1;
// NOTE(review): presumably over-allocated to hold `size` entries -- confirm
// against the allocation code in codeobject.c.
char *entries[1];
85+
} _PyCodeArray;
86+
87+
// Expands to the `co_tlbc` member declaration used inside _PyCode_DEF in
// free-threaded builds, and to nothing in default builds.
#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \
88+
_PyCodeArray *co_tlbc;
89+
#else
90+
#define _PyCode_DEF_THREAD_LOCAL_BYTECODE()
91+
#endif
92+
7593
// To avoid repeating ourselves in deepfreeze.py, all PyCodeObject members are
7694
// defined in this macro:
7795
#define _PyCode_DEF(SIZE) { \
@@ -138,6 +156,7 @@ typedef struct {
138156
Type is a void* to keep the format private in codeobject.c to force \
139157
people to go through the proper APIs. */ \
140158
void *co_extra; \
159+
_PyCode_DEF_THREAD_LOCAL_BYTECODE() \
141160
char co_code_adaptive[(SIZE)]; \
142161
}
143162

Include/cpython/initconfig.h

+1
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ typedef struct PyConfig {
183183
int cpu_count;
184184
#ifdef Py_GIL_DISABLED
185185
int enable_gil;
186+
int tlbc_enabled;
186187
#endif
187188

188189
/* --- Path configuration inputs ------------ */

Include/internal/pycore_ceval.h

+12
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,18 @@ _PyEval_IsGILEnabled(PyThreadState *tstate)
174174
extern int _PyEval_EnableGILTransient(PyThreadState *tstate);
175175
extern int _PyEval_EnableGILPermanent(PyThreadState *tstate);
176176
extern int _PyEval_DisableGIL(PyThreadState *state);
177+
178+
179+
// Return the bytecode that `tstate` should execute for `co`: the thread-local
// copy found by _PyCode_GetTLBCFast() when one exists, otherwise the result
// of _PyCode_GetTLBC(co).
// NOTE(review): _PyCode_GetTLBC() takes no thread state, so `tstate` is
// presumably expected to be the calling thread -- confirm at call sites.
static inline _Py_CODEUNIT *
180+
_PyEval_GetExecutableCode(PyThreadState *tstate, PyCodeObject *co)
181+
{
182+
_Py_CODEUNIT *bc = _PyCode_GetTLBCFast(tstate, co);
183+
if (bc != NULL) {
184+
return bc;
185+
}
186+
// Slow path: the fast lookup found no copy for this thread.
return _PyCode_GetTLBC(co);
187+
}
188+
177189
#endif
178190

179191
extern void _PyEval_DeactivateOpCache(void);

Include/internal/pycore_code.h

+41
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ extern "C" {
1111
#include "pycore_stackref.h" // _PyStackRef
1212
#include "pycore_lock.h" // PyMutex
1313
#include "pycore_backoff.h" // _Py_BackoffCounter
14+
#include "pycore_tstate.h" // _PyThreadStateImpl
1415

1516

1617
/* Each instruction in a code object is a fixed-width value,
@@ -313,11 +314,17 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
313314
/** API for executors */
314315
extern void _PyCode_Clear_Executors(PyCodeObject *code);
315316

317+
316318
#ifdef Py_GIL_DISABLED
317319
// ENABLE_SPECIALIZATION stays 0 in free-threaded builds for now;
// gh-115999 tracks progress on addressing this.
318320
#define ENABLE_SPECIALIZATION 0
321+
// Use this to enable specialization families once they are thread-safe. All
322+
// uses will be replaced with ENABLE_SPECIALIZATION once all families are
323+
// thread-safe.
324+
#define ENABLE_SPECIALIZATION_FT 1
319325
#else
320326
#define ENABLE_SPECIALIZATION 1
327+
// In default builds the two macros are equivalent.
#define ENABLE_SPECIALIZATION_FT ENABLE_SPECIALIZATION
321328
#endif
322329

323330
/* Specialization functions */
@@ -600,6 +607,40 @@ struct _PyCode8 _PyCode_DEF(8);
600607

601608
PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup;
602609

610+
#ifdef Py_GIL_DISABLED
611+
612+
// Return a pointer to the thread-local bytecode for the current thread, if it
613+
// exists.
//
// Returns NULL when the thread's tlbc_index is not below the array's `size`
// or when the entry at that index is NULL. This function only reads; see
// _PyCode_GetTLBC() for the variant that creates the copy if necessary.
// NOTE(review): `idx` is not checked for negative values here -- presumably
// tlbc_index is always >= 0 for a thread that runs bytecode; confirm against
// the callers of _Py_ReserveTLBCIndex().
614+
static inline _Py_CODEUNIT *
615+
_PyCode_GetTLBCFast(PyThreadState *tstate, PyCodeObject *co)
616+
{
617+
// Acquire-load the array pointer. NOTE(review): presumably pairs with a
// release store made when the array is replaced -- confirm in codeobject.c.
_PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc);
618+
int32_t idx = ((_PyThreadStateImpl*) tstate)->tlbc_index;
619+
if (idx < code->size && code->entries[idx] != NULL) {
620+
return (_Py_CODEUNIT *) code->entries[idx];
621+
}
622+
return NULL;
623+
}
624+
625+
// Return a pointer to the thread-local bytecode for the current thread,
626+
// creating it if necessary.
627+
extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co);
628+
629+
// Reserve an index for the current thread into thread-local bytecode
630+
// arrays
631+
//
632+
// Returns the reserved index or -1 on error.
633+
extern int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp);
634+
635+
// Release the current thread's index into thread-local bytecode arrays
636+
extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate);
637+
638+
// Free all TLBC copies not associated with live threads.
639+
//
640+
// Returns 0 on success or -1 on error.
641+
extern int _Py_ClearUnusedTLBC(PyInterpreterState *interp);
642+
#endif
643+
603644
#ifdef __cplusplus
604645
}
605646
#endif

Include/internal/pycore_frame.h

+52-4
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,10 @@ typedef struct _PyInterpreterFrame {
6868
PyObject *f_locals; /* Strong reference, may be NULL. Only valid if not on C stack */
6969
PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */
7070
_Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */
71+
#ifdef Py_GIL_DISABLED
72+
/* Index of thread-local bytecode containing instr_ptr. */
73+
int32_t tlbc_index;
74+
#endif
7175
_PyStackRef *stackpointer;
7276
uint16_t return_offset; /* Only relevant during a function call */
7377
char owner;
@@ -76,14 +80,27 @@ typedef struct _PyInterpreterFrame {
7680
} _PyInterpreterFrame;
7781

7882
#define _PyInterpreterFrame_LASTI(IF) \
79-
((int)((IF)->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(IF))))
83+
((int)((IF)->instr_ptr - _PyFrame_GetBytecode((IF))))
8084

8185
// Return the frame's executable (f->f_executable) as a code object. The
// conversion goes through PyStackRef_AsPyObjectBorrow(), so no new reference
// is taken here. Asserts that the executable is a code object.
static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) {
8286
PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable);
8387
assert(PyCode_Check(executable));
8488
return (PyCodeObject *)executable;
8589
}
8690

91+
// Return the start of the bytecode array that this frame is executing.
//
// In free-threaded builds this is the co_tlbc entry selected by
// f->tlbc_index (which need not be the main copy); in default builds it is
// the code object's single bytecode array, _PyCode_CODE(code).
static inline _Py_CODEUNIT *
92+
_PyFrame_GetBytecode(_PyInterpreterFrame *f)
93+
{
94+
#ifdef Py_GIL_DISABLED
95+
PyCodeObject *co = _PyFrame_GetCode(f);
96+
_PyCodeArray *tlbc = _Py_atomic_load_ptr_acquire(&co->co_tlbc);
97+
// The frame's index must name a slot inside the currently loaded array.
assert(f->tlbc_index >= 0 && f->tlbc_index < tlbc->size);
98+
return (_Py_CODEUNIT *)tlbc->entries[f->tlbc_index];
99+
#else
100+
return _PyCode_CODE(_PyFrame_GetCode(f));
101+
#endif
102+
}
103+
87104
static inline PyFunctionObject *_PyFrame_GetFunction(_PyInterpreterFrame *f) {
88105
PyObject *func = PyStackRef_AsPyObjectBorrow(f->f_funcobj);
89106
assert(PyFunction_Check(func));
@@ -144,13 +161,33 @@ static inline void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame *
144161
#endif
145162
}
146163

164+
#ifdef Py_GIL_DISABLED
165+
// Set frame->instr_ptr and frame->tlbc_index for a frame that is about to run
// `code` on `tstate`: use the thread's existing copy of the bytecode if
// _PyCode_GetTLBCFast() finds one, otherwise fall back to the copy at index 0.
// This function never creates a thread-local copy itself.
static inline void
166+
_PyFrame_InitializeTLBC(PyThreadState *tstate, _PyInterpreterFrame *frame,
167+
PyCodeObject *code)
168+
{
169+
_Py_CODEUNIT *tlbc = _PyCode_GetTLBCFast(tstate, code);
170+
if (tlbc == NULL) {
171+
// No thread-local bytecode exists for this thread yet; use the main
172+
// thread's copy, deferring thread-local bytecode creation to the
173+
// execution of RESUME.
174+
frame->instr_ptr = _PyCode_CODE(code);
175+
frame->tlbc_index = 0;
176+
}
177+
else {
178+
// Record the thread's own index alongside the pointer into its copy.
frame->instr_ptr = tlbc;
179+
frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
180+
}
181+
}
182+
#endif
183+
147184
/* Consumes reference to func and locals.
148185
Does not initialize frame->previous, which happens
149186
when frame is linked into the frame stack.
150187
*/
151188
static inline void
152189
_PyFrame_Initialize(
153-
_PyInterpreterFrame *frame, _PyStackRef func,
190+
PyThreadState *tstate, _PyInterpreterFrame *frame, _PyStackRef func,
154191
PyObject *locals, PyCodeObject *code, int null_locals_from, _PyInterpreterFrame *previous)
155192
{
156193
frame->previous = previous;
@@ -162,7 +199,12 @@ _PyFrame_Initialize(
162199
frame->f_locals = locals;
163200
frame->stackpointer = frame->localsplus + code->co_nlocalsplus;
164201
frame->frame_obj = NULL;
202+
#ifdef Py_GIL_DISABLED
203+
_PyFrame_InitializeTLBC(tstate, frame, code);
204+
#else
205+
(void)tstate;
165206
frame->instr_ptr = _PyCode_CODE(code);
207+
#endif
166208
frame->return_offset = 0;
167209
frame->owner = FRAME_OWNED_BY_THREAD;
168210

@@ -224,7 +266,8 @@ _PyFrame_IsIncomplete(_PyInterpreterFrame *frame)
224266
return true;
225267
}
226268
return frame->owner != FRAME_OWNED_BY_GENERATOR &&
227-
frame->instr_ptr < _PyCode_CODE(_PyFrame_GetCode(frame)) + _PyFrame_GetCode(frame)->_co_firsttraceable;
269+
frame->instr_ptr < _PyFrame_GetBytecode(frame) +
270+
_PyFrame_GetCode(frame)->_co_firsttraceable;
228271
}
229272

230273
static inline _PyInterpreterFrame *
@@ -315,7 +358,8 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_
315358
_PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top;
316359
tstate->datastack_top += code->co_framesize;
317360
assert(tstate->datastack_top < tstate->datastack_limit);
318-
_PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from, previous);
361+
_PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from,
362+
previous);
319363
return new_frame;
320364
}
321365

@@ -339,7 +383,11 @@ _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int
339383
assert(stackdepth <= code->co_stacksize);
340384
frame->stackpointer = frame->localsplus + code->co_nlocalsplus + stackdepth;
341385
frame->frame_obj = NULL;
386+
#ifdef Py_GIL_DISABLED
387+
_PyFrame_InitializeTLBC(tstate, frame, code);
388+
#else
342389
frame->instr_ptr = _PyCode_CODE(code);
390+
#endif
343391
frame->owner = FRAME_OWNED_BY_THREAD;
344392
frame->return_offset = 0;
345393

Include/internal/pycore_gc.h

+4
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,10 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar
389389
} \
390390
} while (0)
391391

392+
#ifdef Py_GIL_DISABLED
393+
extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp,
394+
gcvisitobjects_t callback, void *arg);
395+
#endif
392396

393397
#ifdef __cplusplus
394398
}

Include/internal/pycore_index_pool.h

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#ifndef Py_INTERNAL_INDEX_POOL_H
2+
#define Py_INTERNAL_INDEX_POOL_H
3+
4+
#include "Python.h"
5+
6+
#ifdef __cplusplus
7+
extern "C" {
8+
#endif
9+
10+
#ifndef Py_BUILD_CORE
11+
# error "this header requires Py_BUILD_CORE define"
12+
#endif
13+
14+
#ifdef Py_GIL_DISABLED
15+
16+
// This contains code for allocating unique indices in an array. It is used by
17+
// the free-threaded build to assign each thread a globally unique index into
18+
// each code object's thread-local bytecode array.
19+
20+
// A min-heap of indices
21+
typedef struct _PyIndexHeap {
22+
// Storage for the heap's items.
// NOTE(review): presumably heap-allocated and grown on demand -- confirm
// in the implementation file.
int32_t *values;
23+
24+
// Number of items stored in values
25+
Py_ssize_t size;
26+
27+
// Maximum number of items that can be stored in values
28+
Py_ssize_t capacity;
29+
} _PyIndexHeap;
30+
31+
// An unbounded pool of indices. Indices are allocated starting from 0. They
32+
// may be released back to the pool once they are no longer in use.
33+
typedef struct _PyIndexPool {
34+
// NOTE(review): presumably serializes allocation and release on this
// pool -- confirm in the implementation file.
PyMutex mutex;
35+
36+
// Min heap of indices available for allocation
37+
_PyIndexHeap free_indices;
38+
39+
// Next index to allocate if no free indices are available
40+
int32_t next_index;
41+
} _PyIndexPool;
42+
43+
// Allocate the smallest available index. Returns -1 on error.
44+
extern int32_t _PyIndexPool_AllocIndex(_PyIndexPool *indices);
45+
46+
// Release `index` back to the pool
47+
extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, int32_t index);
48+
49+
extern void _PyIndexPool_Fini(_PyIndexPool *indices);
50+
51+
#endif // Py_GIL_DISABLED
52+
53+
#ifdef __cplusplus
54+
}
55+
#endif
56+
#endif // !Py_INTERNAL_INDEX_POOL_H

Include/internal/pycore_interp.h

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ extern "C" {
2626
#include "pycore_genobject.h" // _PyGen_FetchStopIterationValue
2727
#include "pycore_global_objects.h"// struct _Py_interp_cached_objects
2828
#include "pycore_import.h" // struct _import_state
29+
#include "pycore_index_pool.h" // _PyIndexPool
2930
#include "pycore_instruments.h" // _PY_MONITORING_EVENTS
3031
#include "pycore_list.h" // struct _Py_list_state
3132
#include "pycore_mimalloc.h" // struct _mimalloc_interp_state
@@ -222,6 +223,7 @@ struct _is {
222223
struct _brc_state brc; // biased reference counting state
223224
struct _Py_unique_id_pool unique_ids; // object ids for per-thread refcounts
224225
PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS];
226+
_PyIndexPool tlbc_indices;
225227
#endif
226228

227229
// Per-interpreter state for the obmalloc allocator. For the main

Include/internal/pycore_tstate.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ typedef struct _PyThreadStateImpl {
4242
int is_finalized;
4343
} refcounts;
4444

45+
// Index to use to retrieve thread-local bytecode for this thread
46+
int32_t tlbc_index;
47+
4548
// When >1, code objects do not immortalize their non-string constants.
4649
int suppress_co_const_immortalization;
4750
#endif
@@ -52,7 +55,6 @@ typedef struct _PyThreadStateImpl {
5255

5356
} _PyThreadStateImpl;
5457

55-
5658
#ifdef __cplusplus
5759
}
5860
#endif

0 commit comments

Comments
 (0)