@@ -62,6 +62,8 @@ STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
 STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
 STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+STATISTIC(NumConstOffsetFolded,
+          "Number of const offset of index address folded");
 
 DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
               "Controls which pairs are considered for renaming");
@@ -75,6 +77,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
 static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
                                      cl::Hidden);
 
+// The LdStConstLimit limits how far we search for const offset instructions
+// when we form index address load/store instructions.
+static cl::opt<unsigned> LdStConstLimit("aarch64-load-store-const-scan-limit",
+                                        cl::init(10), cl::Hidden);
+
 // Enable register renaming to find additional store pairing opportunities.
 static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
                                     cl::init(true), cl::Hidden);
@@ -171,6 +178,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
                                 int UnscaledOffset, unsigned Limit);
 
+  // Scan the instruction list to find a register assigned with a const
+  // value that can be combined with the current instruction (a load or store)
+  // using unsigned-offset base addressing. Scan backwards.
+  MachineBasicBlock::iterator
+  findMatchingConstOffsetBackward(MachineBasicBlock::iterator I, unsigned Limit,
+                                  unsigned &Offset);
+
   // Scan the instruction list to find a base register update that can
   // be combined with the current instruction (a load or store) using
   // pre or post indexed addressing with writeback. Scan backwards.
@@ -182,11 +196,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
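+  // Return true if MI is a movk that, together with the movz that precedes it,
+  // materializes a constant offset for the index register of MemMI; the
+  // combined value is returned in Offset.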
+  bool isMatchingMovConstInsn(MachineInstr &MemMI, MachineInstr &MI,
+                              unsigned IndexReg, unsigned &Offset);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Update, bool IsPreIdx);
 
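+  // Fold the constant offset materialized by a movz/movk pair (found by
+  // findMatchingConstOffsetBackward) into the ld/st at I: the high bits become
+  // an ADD immediate and the low bits an unsigned ld/st immediate.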
+  MachineBasicBlock::iterator
+  mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                       MachineBasicBlock::iterator Update, unsigned Offset,
+                       int Scale);
+
   // Find and merge zero store instructions.
   bool tryToMergeZeroStInst(MachineBasicBlock::iterator &MBBI);
 
@@ -199,6 +221,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and merge a base register updates before or after a ld/st instruction.
   bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
 
+  // Find and merge an index ldr/st instruction into a base ld/st instruction.
+  bool tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI, int Scale);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -481,6 +506,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
   }
 }
 
+static unsigned getBaseAddressOpcode(unsigned Opc) {
+  // TODO: Add more index address loads/stores.
+  switch (Opc) {
+  default:
+    llvm_unreachable("Opcode has no base address equivalent!");
+  case AArch64::LDRBBroX:
+    return AArch64::LDRBBui;
+  }
+}
+
 static unsigned getPostIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
@@ -722,6 +757,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
   }
 }
 
+// Make sure this is a reg+reg Ld/St.
+static bool isMergeableIndexLdSt(MachineInstr &MI, int &Scale) {
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  // Scaled instructions.
+  // TODO: Add more index address loads/stores.
+  case AArch64::LDRBBroX:
+    Scale = 1;
+    return true;
+  }
+}
+
 static bool isRewritableImplicitDef(unsigned Opc) {
   switch (Opc) {
   default:
@@ -2018,6 +2067,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
   return NextI;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::mergeConstOffsetInsn(MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator Update,
+                                          unsigned Offset, int Scale) {
+  assert((Update->getOpcode() == AArch64::MOVKWi) &&
+         "Unexpected const mov instruction to merge!");
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator NextI = next_nodbg(I, E);
+  MachineBasicBlock::iterator PrevI = prev_nodbg(Update, E);
+  MachineInstr &MemMI = *I;
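+  // Split the offset: the low bits fit the scaled unsigned 12-bit ld/st
+  // immediate, the high bits are encoded by an ADD with LSL #12.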
+  unsigned Mask = (1 << 12) * Scale - 1;
+  unsigned Low = Offset & Mask;
+  unsigned High = Offset - Low;
+  Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+  MachineInstrBuilder AddMIB, MemMIB;
+
+  // Add IndexReg, BaseReg, High (the BaseReg may be SP)
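+  // IndexReg is killed at the ld/st (checked when the movz/movk pair was
+  // matched), so it is safe to reuse it as the destination of the ADD.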
+  AddMIB =
+      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(AArch64::ADDXri))
+          .addDef(IndexReg)
+          .addUse(BaseReg)
+          .addImm(High >> 12) // shifted value
+          .addImm(12);        // shift 12
+  (void)AddMIB;
+  // Ld/St DestReg, IndexReg, Imm12
+  unsigned NewOpc = getBaseAddressOpcode(I->getOpcode());
+  MemMIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+               .add(getLdStRegOp(MemMI))
+               .add(AArch64InstrInfo::getLdStOffsetOp(MemMI))
+               .addImm(Low / Scale)
+               .setMemRefs(I->memoperands())
+               .setMIFlags(I->mergeFlagsWith(*Update));
+  (void)MemMIB;
+
+  ++NumConstOffsetFolded;
+  LLVM_DEBUG(dbgs() << "Creating base address load/store.\n");
+  LLVM_DEBUG(dbgs() << "    Replacing instructions:\n    ");
+  LLVM_DEBUG(PrevI->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(Update->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(I->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "  with instruction:\n    ");
+  LLVM_DEBUG(((MachineInstr *)AddMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "    ");
+  LLVM_DEBUG(((MachineInstr *)MemMIB)->print(dbgs()));
+  LLVM_DEBUG(dbgs() << "\n");
+
+  // Erase the old instructions for the block.
+  I->eraseFromParent();
+  PrevI->eraseFromParent();
+  Update->eraseFromParent();
+
+  return NextI;
+}
+
 bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
                                                MachineInstr &MI,
                                                unsigned BaseReg, int Offset) {
@@ -2065,6 +2171,31 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
   return false;
 }
 
+bool AArch64LoadStoreOpt::isMatchingMovConstInsn(MachineInstr &MemMI,
+                                                 MachineInstr &MI,
+                                                 unsigned IndexReg,
+                                                 unsigned &Offset) {
+  // The update instruction source and destination register must be the
+  // same as the load/store index register.
+  if (MI.getOpcode() == AArch64::MOVKWi &&
+      TRI->isSuperOrSubRegisterEq(IndexReg, MI.getOperand(1).getReg())) {
+
+    // movz + movk hold a large offset of a Ld/St instruction.
+    MachineBasicBlock::iterator B = MI.getParent()->begin();
+    MachineBasicBlock::iterator MBBI = &MI;
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MovzMI = *MBBI;
+    if (MovzMI.getOpcode() == AArch64::MOVZWi) {
+      unsigned Low = MovzMI.getOperand(1).getImm();
+      unsigned High = MI.getOperand(2).getImm() << MI.getOperand(3).getImm();
+      Offset = High + Low;
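+      // The high bits will be folded into an ADD with LSL #12 and the low
+      // bits into the ld/st's unsigned immediate, so the combined offset must
+      // fit in 24 bits.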
+      // 12-bit optionally shifted immediates are legal for adds.
+      return Offset >> 24 == 0;
+    }
+  }
+  return false;
+}
+
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
     MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
@@ -2220,6 +2351,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
   return E;
 }
 
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::findMatchingConstOffsetBackward(
+    MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
+  MachineBasicBlock::iterator B = I->getParent()->begin();
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineInstr &MemMI = *I;
+  MachineBasicBlock::iterator MBBI = I;
+
+  // If the load/store is the first instruction in the block, there's obviously
+  // not any matching mov const instruction.
+  if (MBBI == B)
+    return E;
+
+  // Make sure the IndexReg is killed and the shift amount is zero.
+  // TODO: Relax this restriction to allow extends; kept simple for now.
+  if (!AArch64InstrInfo::getLdStOffsetOp(MemMI).isKill() ||
+      !AArch64InstrInfo::getLdStAmountOp(MemMI).isImm() ||
+      (AArch64InstrInfo::getLdStAmountOp(MemMI).getImm() != 0))
+    return E;
+
+  Register IndexReg = AArch64InstrInfo::getLdStOffsetOp(MemMI).getReg();
+
+  // Track which register units have been modified and used between the first
+  // insn (inclusive) and the second insn.
+  ModifiedRegUnits.clear();
+  UsedRegUnits.clear();
+  unsigned Count = 0;
+  do {
+    MBBI = prev_nodbg(MBBI, B);
+    MachineInstr &MI = *MBBI;
+
+    // Don't count transient instructions towards the search limit since there
+    // may be different numbers of them if e.g. debug information is present.
+    if (!MI.isTransient())
+      ++Count;
+
+    // If we found a match, return it.
+    if (isMatchingMovConstInsn(*I, MI, IndexReg, Offset)) {
+      return MBBI;
+    }
+
+    // Update the status of what the instruction clobbered and used.
+    LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
+
+    // Otherwise, if the index register is used or modified, we have no match,
+    // so return early.
+    if (!ModifiedRegUnits.available(IndexReg) ||
+        !UsedRegUnits.available(IndexReg))
+      return E;
+
+  } while (MBBI != B && Count < Limit);
+  return E;
+}
+
 bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
     MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -2404,6 +2589,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
   return false;
 }
 
+bool AArch64LoadStoreOpt::tryToMergeIndexLdSt(MachineBasicBlock::iterator &MBBI,
+                                              int Scale) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+  MachineBasicBlock::iterator Update;
+
+  // Don't know how to handle unscaled pre/post-index versions below, so bail.
+  if (TII->hasUnscaledLdStOffset(MI.getOpcode()))
+    return false;
+
+  // Look back to try to find a const offset for an index LdSt instruction. For
+  // example,
+  //   mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  //   ldr x1, [x0, x8]
+  // merged into:
+  //   add x8, x0, a * (1<<12)
+  //   ldr x1, [x8, imm12]
+  unsigned Offset;
+  Update = findMatchingConstOffsetBackward(MBBI, LdStConstLimit, Offset);
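+  // The low bits of the offset must be a multiple of the access size so that
+  // they can be encoded as a scaled unsigned immediate.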
+  if (Update != E && (Offset & (Scale - 1)) == 0) {
+    // Merge the imm12 into the ld/st.
+    MBBI = mergeConstOffsetInsn(MBBI, Update, Offset, Scale);
+    return true;
+  }
+
+  return false;
+}
+
 bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
                                         bool EnableNarrowZeroStOpt) {
 
@@ -2482,6 +2695,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
     ++MBBI;
   }
 
+  // 5) Find a register assigned with a const value that can be combined into
+  //    the load or store. e.g.,
+  //      mov x8, #LargeImm   ; = a * (1<<12) + imm12
+  //      ldr x1, [x0, x8]
+  //    ; becomes
+  //      add x8, x0, a * (1<<12)
+  //      ldr x1, [x8, imm12]
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+       MBBI != E;) {
+    int Scale;
+    if (isMergeableIndexLdSt(*MBBI, Scale) && tryToMergeIndexLdSt(MBBI, Scale))
+      Modified = true;
+    else
+      ++MBBI;
+  }
+
   return Modified;
 }
 