@@ -111,7 +111,8 @@ never_optimize(
     _PyInterpreterFrame *frame,
     _Py_CODEUNIT *instr,
     _PyExecutorObject **exec,
-    int Py_UNUSED(stack_entries))
+    int Py_UNUSED(stack_entries),
+    bool Py_UNUSED(progress_needed))
 {
     // This may be called if the optimizer is reset
     return 0;
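The new `bool progress_needed` parameter changes the shape of every optimizer callback, so the function-pointer type they share has to grow the same argument. A minimal sketch of the updated type, assuming it mirrors the parameter lists in this diff (the real typedef lives in CPython's internal headers and its exact name may differ):

/* Sketch only: assumed to mirror the callbacks in this diff. */
typedef int (*optimize_func)(
    _PyOptimizerObject *self,
    struct _PyInterpreterFrame *frame,
    _Py_CODEUNIT *instr,
    _PyExecutorObject **exec_ptr,
    int curr_stackentries,
    bool progress_needed);  /* new: must the trace advance past `instr`? */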
@@ -176,32 +177,44 @@ _Py_SetTier2Optimizer(_PyOptimizerObject *optimizer)
 int
 _PyOptimizer_Optimize(
     _PyInterpreterFrame *frame, _Py_CODEUNIT *start,
-    _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr)
+    _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, int chain_depth)
 {
+    // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
+    // make progress in order to avoid infinite loops or excessively long
+    // side-exit chains. We can only insert the executor into the bytecode if
+    // this is true, since a deopt won't infinitely re-enter the executor:
+    chain_depth %= MAX_CHAIN_DEPTH;
+    bool progress_needed = chain_depth == 0;
     PyCodeObject *code = _PyFrame_GetCode(frame);
     assert(PyCode_Check(code));
     PyInterpreterState *interp = _PyInterpreterState_GET();
-    if (!has_space_for_executor(code, start)) {
+    if (progress_needed && !has_space_for_executor(code, start)) {
         return 0;
     }
     _PyOptimizerObject *opt = interp->optimizer;
-    int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
+    int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)), progress_needed);
     if (err <= 0) {
         return err;
     }
     assert(*executor_ptr != NULL);
-    int index = get_index_for_executor(code, start);
-    if (index < 0) {
-        /* Out of memory. Don't raise and assume that the
-         * error will show up elsewhere.
-         *
-         * If an optimizer has already produced an executor,
-         * it might get confused by the executor disappearing,
-         * but there is not much we can do about that here. */
-        Py_DECREF(*executor_ptr);
-        return 0;
+    if (progress_needed) {
+        int index = get_index_for_executor(code, start);
+        if (index < 0) {
+            /* Out of memory. Don't raise and assume that the
+             * error will show up elsewhere.
+             *
+             * If an optimizer has already produced an executor,
+             * it might get confused by the executor disappearing,
+             * but there is not much we can do about that here. */
+            Py_DECREF(*executor_ptr);
+            return 0;
+        }
+        insert_executor(code, start, index, *executor_ptr);
     }
-    insert_executor(code, start, index, *executor_ptr);
+    else {
+        (*executor_ptr)->vm_data.code = NULL;
+    }
+    (*executor_ptr)->vm_data.chain_depth = chain_depth;
     assert((*executor_ptr)->vm_data.valid);
     return 1;
 }
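To make the chain-depth arithmetic concrete: because the stored depth is reduced modulo MAX_CHAIN_DEPTH, the first executor in a chain and every MAX_CHAIN_DEPTH'th one after it come out with depth 0 and are therefore forced to make progress. A standalone sketch (the value 8 for MAX_CHAIN_DEPTH is an assumption for illustration, not necessarily the constant CPython uses):

#include <stdbool.h>
#include <stdio.h>

#define MAX_CHAIN_DEPTH 8  /* assumed value, for illustration only */

int main(void)
{
    /* Walk a hypothetical chain of side-exit executors and show which
     * links are required to make progress (stored depth wraps to 0): */
    for (int depth = 0; depth < 18; depth++) {
        int stored = depth % MAX_CHAIN_DEPTH;
        bool progress_needed = (stored == 0);
        printf("link %2d -> stored depth %d, progress_needed=%s\n",
               depth, stored, progress_needed ? "true" : "false");
    }
    return 0;
}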
@@ -530,9 +543,9 @@ translate_bytecode_to_trace(
     _Py_CODEUNIT *instr,
     _PyUOpInstruction *trace,
     int buffer_size,
-    _PyBloomFilter *dependencies)
+    _PyBloomFilter *dependencies, bool progress_needed)
 {
-    bool progress_needed = true;
+    bool first = true;
     PyCodeObject *code = _PyFrame_GetCode(frame);
     PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj;
     assert(PyFunction_Check(func));
@@ -576,7 +589,7 @@ translate_bytecode_to_trace(
         uint32_t opcode = instr->op.code;
         uint32_t oparg = instr->op.arg;

-        if (!progress_needed && instr == initial_instr) {
+        if (!first && instr == initial_instr) {
             // We have looped around to the start:
             RESERVE(1);
             ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
@@ -585,14 +598,6 @@ translate_bytecode_to_trace(

         DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg);

-        if (opcode == ENTER_EXECUTOR) {
-            assert(oparg < 256);
-            _PyExecutorObject *executor = code->co_executors->executors[oparg];
-            opcode = executor->vm_data.opcode;
-            DPRINTF(2, " * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]);
-            oparg = executor->vm_data.oparg;
-        }
-
         if (opcode == EXTENDED_ARG) {
             instr++;
             opcode = instr->op.code;
@@ -602,13 +607,27 @@ translate_bytecode_to_trace(
                 goto done;
             }
         }
+        if (opcode == ENTER_EXECUTOR) {
+            // We have a couple of options here. We *could* peek "underneath"
+            // this executor and continue tracing, which could give us a longer,
+            // more optimizable trace (at the expense of lots of duplicated
+            // tier two code). Instead, we choose to just end here and stitch to
+            // the other trace, which allows side-exit traces to rejoin the
+            // "main" trace periodically (and also helps protect us against
+            // pathological behavior where the amount of tier two code explodes
+            // for a medium-length, branchy code path). This seems to work
+            // better in practice, but in the future we could be smarter about
+            // what we do here:
+            goto done;
+        }
         assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
         RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP");
         ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target);

         /* Special case the first instruction,
          * so that we can guarantee forward progress */
-        if (progress_needed) {
+        if (first && progress_needed) {
+            assert(first);
             if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
                 opcode = _PyOpcode_Deopt[opcode];
             }
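The first-instruction special case exists because a must-make-progress trace whose head can deopt would bounce straight back to `start` and re-enter the executor without executing anything; replacing the head with its unspecialized form via `_PyOpcode_Deopt` avoids that. A toy, self-contained simulation of the idea, with made-up opcode values standing in for CPython's real tables:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for CPython's opcode metadata: */
enum { BINARY_OP = 1, BINARY_OP_ADD_INT = 2 };

/* Like _PyOpcode_Deopt: maps a specialized opcode back to its
 * unspecialized form, which always completes instead of exiting: */
static const int toy_deopt[] = {
    [BINARY_OP] = BINARY_OP,
    [BINARY_OP_ADD_INT] = BINARY_OP,
};

/* Like OPCODE_HAS_DEOPT, in our toy model: */
static bool toy_has_deopt(int opcode) { return opcode == BINARY_OP_ADD_INT; }

int main(void)
{
    int head = BINARY_OP_ADD_INT;  /* first instruction of the trace */
    bool first = true, progress_needed = true;
    if (first && progress_needed && toy_has_deopt(head)) {
        head = toy_deopt[head];  /* guarantee the head can't deopt */
    }
    printf("traced head opcode: %d\n", head);  /* prints 1 (BINARY_OP) */
    return 0;
}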
@@ -903,7 +922,7 @@ translate_bytecode_to_trace(
         }
     top:
         // Jump here after _PUSH_FRAME or likely branches.
-        progress_needed = false;
+        first = false;
     }  // End for (;;)

 done:
@@ -912,7 +931,7 @@ translate_bytecode_to_trace(
     }
     assert(code == initial_code);
     // Skip short traces where we can't even translate a single instruction:
-    if (progress_needed) {
+    if (first) {
        OPT_STAT_INC(trace_too_short);
        DPRINTF(2,
                "No trace for %s (%s:%d) at byte offset %d (no progress)\n",
@@ -1225,13 +1244,14 @@ uop_optimize(
     _PyInterpreterFrame *frame,
     _Py_CODEUNIT *instr,
     _PyExecutorObject **exec_ptr,
-    int curr_stackentries)
+    int curr_stackentries,
+    bool progress_needed)
 {
     _PyBloomFilter dependencies;
     _Py_BloomFilter_Init(&dependencies);
     _PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
     OPT_STAT_INC(attempts);
-    int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies);
+    int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies, progress_needed);
     if (length <= 0) {
         // Error or nothing translated
         return length;
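For orientation, the callers of `_PyOptimizer_Optimize` (not shown in this diff) are what supply the new chain depth. A hypothetical call-site sketch, assuming a `parent` executor is at hand when one of its side exits gets hot; the variable names here are illustrative, not CPython's:

/* A fresh chain: a hot loop detected at tier one starts at depth 0. */
_PyExecutorObject *executor;
int err = _PyOptimizer_Optimize(frame, start, stack_pointer, &executor, 0);

/* A chained trace: a hot side exit extends its parent by one link. */
err = _PyOptimizer_Optimize(frame, target, stack_pointer, &executor,
                            parent->vm_data.chain_depth + 1);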
@@ -1328,7 +1348,8 @@ counter_optimize(
     _PyInterpreterFrame *frame,
     _Py_CODEUNIT *instr,
     _PyExecutorObject **exec_ptr,
-    int Py_UNUSED(curr_stackentries)
+    int Py_UNUSED(curr_stackentries),
+    bool Py_UNUSED(progress_needed)
 )
 {
     PyCodeObject *code = _PyFrame_GetCode(frame);