Skip to content

Commit 9621a7d

Browse files
authored
GH-118093: Handle some polymorphism before requiring progress in tier two (GH-122843)
1 parent 503af8f commit 9621a7d

File tree

6 files changed

+73
-42
lines changed

6 files changed

+73
-42
lines changed

Include/internal/pycore_optimizer.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,9 @@ typedef struct {
2929
typedef struct {
3030
uint8_t opcode;
3131
uint8_t oparg;
32-
uint8_t valid;
33-
uint8_t linked;
32+
uint16_t valid:1;
33+
uint16_t linked:1;
34+
uint16_t chain_depth:14; // Must be big enough for MAX_CHAIN_DEPTH - 1.
3435
int index; // Index of ENTER_EXECUTOR (if code isn't NULL, below).
3536
_PyBloomFilter bloom;
3637
_PyExecutorLinkListNode links;
@@ -83,7 +84,7 @@ typedef struct _PyOptimizerObject _PyOptimizerObject;
8384
typedef int (*_Py_optimize_func)(
8485
_PyOptimizerObject* self, struct _PyInterpreterFrame *frame,
8586
_Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr,
86-
int curr_stackentries);
87+
int curr_stackentries, bool progress_needed);
8788

8889
struct _PyOptimizerObject {
8990
PyObject_HEAD
@@ -182,6 +183,12 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst)
182183
// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
183184
#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2)
184185

186+
// The maximum number of side exits that we can take before requiring forward
187+
// progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this
188+
// is the "maximum amount of polymorphism" that an isolated trace tree can
189+
// handle before rejoining the rest of the program.
190+
#define MAX_CHAIN_DEPTH 4
191+
185192
typedef struct _Py_UopsSymbol _Py_UopsSymbol;
186193

187194
struct _Py_UOpsAbstractFrame {
@@ -257,7 +264,7 @@ extern int _Py_uop_frame_pop(_Py_UOpsContext *ctx);
257264

258265
PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored);
259266

260-
PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr);
267+
PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr, int chain_depth);
261268

262269
static inline int is_terminator(const _PyUOpInstruction *uop)
263270
{
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Improve the experimental JIT's handling of polymorphic code.

Python/bytecodes.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2501,7 +2501,7 @@ dummy_func(
25012501
start--;
25022502
}
25032503
_PyExecutorObject *executor;
2504-
int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor);
2504+
int optimized = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, 0);
25052505
ERROR_IF(optimized < 0, error);
25062506
if (optimized) {
25072507
assert(tstate->previous_executor == NULL);
@@ -4543,7 +4543,8 @@ dummy_func(
45434543
Py_INCREF(executor);
45444544
}
45454545
else {
4546-
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
4546+
int chain_depth = current_executor->vm_data.chain_depth + 1;
4547+
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, chain_depth);
45474548
if (optimized <= 0) {
45484549
exit->temperature = restart_backoff_counter(temperature);
45494550
if (optimized < 0) {
@@ -4626,7 +4627,7 @@ dummy_func(
46264627
exit->temperature = advance_backoff_counter(exit->temperature);
46274628
GOTO_TIER_ONE(target);
46284629
}
4629-
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor);
4630+
int optimized = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor, 0);
46304631
if (optimized <= 0) {
46314632
exit->temperature = restart_backoff_counter(exit->temperature);
46324633
if (optimized < 0) {

Python/executor_cases.c.h

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/generated_cases.c.h

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Python/optimizer.c

Lines changed: 53 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ never_optimize(
111111
_PyInterpreterFrame *frame,
112112
_Py_CODEUNIT *instr,
113113
_PyExecutorObject **exec,
114-
int Py_UNUSED(stack_entries))
114+
int Py_UNUSED(stack_entries),
115+
bool Py_UNUSED(progress_needed))
115116
{
116117
// This may be called if the optimizer is reset
117118
return 0;
@@ -176,32 +177,44 @@ _Py_SetTier2Optimizer(_PyOptimizerObject *optimizer)
176177
int
177178
_PyOptimizer_Optimize(
178179
_PyInterpreterFrame *frame, _Py_CODEUNIT *start,
179-
_PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr)
180+
_PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, int chain_depth)
180181
{
182+
// The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
183+
// make progress in order to avoid infinite loops or excessively-long
184+
// side-exit chains. We can only insert the executor into the bytecode if
185+
// this is true, since a deopt won't infinitely re-enter the executor:
186+
chain_depth %= MAX_CHAIN_DEPTH;
187+
bool progress_needed = chain_depth == 0;
181188
PyCodeObject *code = _PyFrame_GetCode(frame);
182189
assert(PyCode_Check(code));
183190
PyInterpreterState *interp = _PyInterpreterState_GET();
184-
if (!has_space_for_executor(code, start)) {
191+
if (progress_needed && !has_space_for_executor(code, start)) {
185192
return 0;
186193
}
187194
_PyOptimizerObject *opt = interp->optimizer;
188-
int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
195+
int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)), progress_needed);
189196
if (err <= 0) {
190197
return err;
191198
}
192199
assert(*executor_ptr != NULL);
193-
int index = get_index_for_executor(code, start);
194-
if (index < 0) {
195-
/* Out of memory. Don't raise and assume that the
196-
* error will show up elsewhere.
197-
*
198-
* If an optimizer has already produced an executor,
199-
* it might get confused by the executor disappearing,
200-
* but there is not much we can do about that here. */
201-
Py_DECREF(*executor_ptr);
202-
return 0;
200+
if (progress_needed) {
201+
int index = get_index_for_executor(code, start);
202+
if (index < 0) {
203+
/* Out of memory. Don't raise and assume that the
204+
* error will show up elsewhere.
205+
*
206+
* If an optimizer has already produced an executor,
207+
* it might get confused by the executor disappearing,
208+
* but there is not much we can do about that here. */
209+
Py_DECREF(*executor_ptr);
210+
return 0;
211+
}
212+
insert_executor(code, start, index, *executor_ptr);
203213
}
204-
insert_executor(code, start, index, *executor_ptr);
214+
else {
215+
(*executor_ptr)->vm_data.code = NULL;
216+
}
217+
(*executor_ptr)->vm_data.chain_depth = chain_depth;
205218
assert((*executor_ptr)->vm_data.valid);
206219
return 1;
207220
}
@@ -530,9 +543,9 @@ translate_bytecode_to_trace(
530543
_Py_CODEUNIT *instr,
531544
_PyUOpInstruction *trace,
532545
int buffer_size,
533-
_PyBloomFilter *dependencies)
546+
_PyBloomFilter *dependencies, bool progress_needed)
534547
{
535-
bool progress_needed = true;
548+
bool first = true;
536549
PyCodeObject *code = _PyFrame_GetCode(frame);
537550
PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj;
538551
assert(PyFunction_Check(func));
@@ -576,7 +589,7 @@ translate_bytecode_to_trace(
576589
uint32_t opcode = instr->op.code;
577590
uint32_t oparg = instr->op.arg;
578591

579-
if (!progress_needed && instr == initial_instr) {
592+
if (!first && instr == initial_instr) {
580593
// We have looped around to the start:
581594
RESERVE(1);
582595
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
@@ -585,14 +598,6 @@ translate_bytecode_to_trace(
585598

586599
DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg);
587600

588-
if (opcode == ENTER_EXECUTOR) {
589-
assert(oparg < 256);
590-
_PyExecutorObject *executor = code->co_executors->executors[oparg];
591-
opcode = executor->vm_data.opcode;
592-
DPRINTF(2, " * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]);
593-
oparg = executor->vm_data.oparg;
594-
}
595-
596601
if (opcode == EXTENDED_ARG) {
597602
instr++;
598603
opcode = instr->op.code;
@@ -602,13 +607,27 @@ translate_bytecode_to_trace(
602607
goto done;
603608
}
604609
}
610+
if (opcode == ENTER_EXECUTOR) {
611+
// We have a couple of options here. We *could* peek "underneath"
612+
// this executor and continue tracing, which could give us a longer,
613+
// more optimizable trace (at the expense of lots of duplicated
614+
// tier two code). Instead, we choose to just end here and stitch to
615+
// the other trace, which allows side-exit traces to rejoin the
616+
// "main" trace periodically (and also helps protect us against
617+
// pathological behavior where the amount of tier two code explodes
618+
// for a medium-length, branchy code path). This seems to work
619+
// better in practice, but in the future we could be smarter about
620+
// what we do here:
621+
goto done;
622+
}
605623
assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
606624
RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP");
607625
ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target);
608626

609627
/* Special case the first instruction,
610628
* so that we can guarantee forward progress */
611-
if (progress_needed) {
629+
if (first && progress_needed) {
630+
assert(first);
612631
if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
613632
opcode = _PyOpcode_Deopt[opcode];
614633
}
@@ -903,7 +922,7 @@ translate_bytecode_to_trace(
903922
}
904923
top:
905924
// Jump here after _PUSH_FRAME or likely branches.
906-
progress_needed = false;
925+
first = false;
907926
} // End for (;;)
908927

909928
done:
@@ -912,7 +931,7 @@ translate_bytecode_to_trace(
912931
}
913932
assert(code == initial_code);
914933
// Skip short traces where we can't even translate a single instruction:
915-
if (progress_needed) {
934+
if (first) {
916935
OPT_STAT_INC(trace_too_short);
917936
DPRINTF(2,
918937
"No trace for %s (%s:%d) at byte offset %d (no progress)\n",
@@ -1225,13 +1244,14 @@ uop_optimize(
12251244
_PyInterpreterFrame *frame,
12261245
_Py_CODEUNIT *instr,
12271246
_PyExecutorObject **exec_ptr,
1228-
int curr_stackentries)
1247+
int curr_stackentries,
1248+
bool progress_needed)
12291249
{
12301250
_PyBloomFilter dependencies;
12311251
_Py_BloomFilter_Init(&dependencies);
12321252
_PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
12331253
OPT_STAT_INC(attempts);
1234-
int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies);
1254+
int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies, progress_needed);
12351255
if (length <= 0) {
12361256
// Error or nothing translated
12371257
return length;
@@ -1328,7 +1348,8 @@ counter_optimize(
13281348
_PyInterpreterFrame *frame,
13291349
_Py_CODEUNIT *instr,
13301350
_PyExecutorObject **exec_ptr,
1331-
int Py_UNUSED(curr_stackentries)
1351+
int Py_UNUSED(curr_stackentries),
1352+
bool Py_UNUSED(progress_needed)
13321353
)
13331354
{
13341355
PyCodeObject *code = _PyFrame_GetCode(frame);

0 commit comments

Comments
 (0)