# Changeset View

# Standalone View

# compiler/nativeGen/X86/CodeGen.hs

Show First 20 Lines • Show All 73 Lines • ▼ Show 20 Line(s) | |||||

74 | import Data.Bits | 74 | import Data.Bits | ||

75 | import Data.Foldable (fold) | 75 | import Data.Foldable (fold) | ||

76 | import Data.Int | 76 | import Data.Int | ||

77 | import Data.Maybe | 77 | import Data.Maybe | ||

78 | import Data.Word | 78 | import Data.Word | ||

79 | 79 | | |||

80 | import qualified Data.Map as M | 80 | import qualified Data.Map as M | ||

81 | 81 | | |||

82 | | ||||

carter: are these for Floats/Doubles or Int32/Word32/Int64/word64? | |||||

This is just a helper datatype that is used in Abhiroop: This is just a helper datatype that is used in `line 960` of this file to avoid rewriting the… | |||||

Now we know. But every new person looking at it will wonder the same thing. Add a short comment to explain this. I would also suggest moving it closer to where it is used. @carter what do you think? AndreasK: Now we know. But every new person looking at it will wonder the same thing.
Add a short… | |||||

82 | is32BitPlatform :: NatM Bool | 83 | is32BitPlatform :: NatM Bool | ||

83 | is32BitPlatform = do | 84 | is32BitPlatform = do | ||

84 | dflags <- getDynFlags | 85 | dflags <- getDynFlags | ||

85 | return $ target32Bit (targetPlatform dflags) | 86 | return $ target32Bit (targetPlatform dflags) | ||

86 | 87 | | |||

87 | sse2Enabled :: NatM Bool | 88 | sse2Enabled :: NatM Bool | ||

88 | sse2Enabled = do | 89 | sse2Enabled = do | ||

89 | dflags <- getDynFlags | 90 | dflags <- getDynFlags | ||

90 | return (isSse2Enabled dflags) | 91 | return (isSse2Enabled dflags) | ||

91 | 92 | | |||

93 | sse4_1Enabled :: NatM Bool | ||||

94 | sse4_1Enabled = do | ||||

95 | dflags <- getDynFlags | ||||

96 | return (isSse4_1Enabled dflags) | ||||

97 | | ||||

92 | sse4_2Enabled :: NatM Bool | 98 | sse4_2Enabled :: NatM Bool | ||

93 | sse4_2Enabled = do | 99 | sse4_2Enabled = do | ||

94 | dflags <- getDynFlags | 100 | dflags <- getDynFlags | ||

95 | return (isSse4_2Enabled dflags) | 101 | return (isSse4_2Enabled dflags) | ||

96 | 102 | | |||

103 | sseEnabled :: NatM Bool | ||||

104 | sseEnabled = do | ||||

105 | dflags <- getDynFlags | ||||

106 | return (isSseEnabled dflags) | ||||

107 | | ||||

108 | avxEnabled :: NatM Bool | ||||

109 | avxEnabled = do | ||||

110 | dflags <- getDynFlags | ||||

111 | return (isAvxEnabled dflags) | ||||

112 | | ||||

97 | if_sse2 :: NatM a -> NatM a -> NatM a | 113 | if_sse2 :: NatM a -> NatM a -> NatM a | ||

98 | if_sse2 sse2 x87 = do | 114 | if_sse2 sse2 x87 = do | ||

99 | b <- sse2Enabled | 115 | b <- sse2Enabled | ||

100 | if b then sse2 else x87 | 116 | if b then sse2 else x87 | ||

101 | 117 | | |||

102 | cmmTopCodeGen | 118 | cmmTopCodeGen | ||

103 | :: RawCmmDecl | 119 | :: RawCmmDecl | ||

104 | -> NatM [NatCmmDecl (Alignment, CmmStatics) Instr] | 120 | -> NatM [NatCmmDecl (Alignment, CmmStatics) Instr] | ||

▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Line(s) | 205 | case foldMap to_unwind_entry regs of | |||

190 | tbl | M.null tbl -> return nilOL | 206 | tbl | M.null tbl -> return nilOL | ||

191 | | otherwise -> do | 207 | | otherwise -> do | ||

192 | lbl <- mkAsmTempLabel <$> getUniqueM | 208 | lbl <- mkAsmTempLabel <$> getUniqueM | ||

193 | return $ unitOL $ UNWIND lbl tbl | 209 | return $ unitOL $ UNWIND lbl tbl | ||

194 | 210 | | |||

195 | CmmAssign reg src | 211 | CmmAssign reg src | ||

196 | | isFloatType ty -> assignReg_FltCode format reg src | 212 | | isFloatType ty -> assignReg_FltCode format reg src | ||

197 | | is32Bit && isWord64 ty -> assignReg_I64Code reg src | 213 | | is32Bit && isWord64 ty -> assignReg_I64Code reg src | ||

214 | | isVecType ty -> assignReg_VecCode format reg src | ||||

198 | | otherwise -> assignReg_IntCode format reg src | 215 | | otherwise -> assignReg_IntCode format reg src | ||

199 | where ty = cmmRegType dflags reg | 216 | where ty = cmmRegType dflags reg | ||

200 | format = cmmTypeFormat ty | 217 | format = cmmTypeFormat ty | ||

201 | 218 | | |||

202 | CmmStore addr src | 219 | CmmStore addr src | ||

203 | | isFloatType ty -> assignMem_FltCode format addr src | 220 | | isFloatType ty -> assignMem_FltCode format addr src | ||

204 | | is32Bit && isWord64 ty -> assignMem_I64Code addr src | 221 | | is32Bit && isWord64 ty -> assignMem_I64Code addr src | ||

222 | | isVecType ty -> assignMem_VecCode format addr src | ||||

205 | | otherwise -> assignMem_IntCode format addr src | 223 | | otherwise -> assignMem_IntCode format addr src | ||

206 | where ty = cmmExprType dflags src | 224 | where ty = cmmExprType dflags src | ||

207 | format = cmmTypeFormat ty | 225 | format = cmmTypeFormat ty | ||

208 | 226 | | |||

209 | CmmUnsafeForeignCall target result_regs args | 227 | CmmUnsafeForeignCall target result_regs args | ||

210 | -> genCCall dflags is32Bit target result_regs args | 228 | -> genCCall dflags is32Bit target result_regs args | ||

211 | 229 | | |||

212 | CmmBranch id -> genBranch id | 230 | CmmBranch id -> genBranch id | ||

▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Line(s) | |||||

263 | -- to live in a pre-decided machine register, it comes out as @Fixed@; | 281 | -- to live in a pre-decided machine register, it comes out as @Fixed@; | ||

264 | -- otherwise, it comes out as @Any@, and the parent can decide which | 282 | -- otherwise, it comes out as @Any@, and the parent can decide which | ||

265 | -- register to put it in. | 283 | -- register to put it in. | ||

266 | -- | 284 | -- | ||

267 | data Register | 285 | data Register | ||

268 | = Fixed Format Reg InstrBlock | 286 | = Fixed Format Reg InstrBlock | ||

269 | | Any Format (Reg -> InstrBlock) | 287 | | Any Format (Reg -> InstrBlock) | ||

270 | 288 | | |||

271 | | ||||

272 | swizzleRegisterRep :: Register -> Format -> Register | 289 | swizzleRegisterRep :: Register -> Format -> Register | ||

273 | swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code | 290 | swizzleRegisterRep (Fixed _ reg code) format = Fixed format reg code | ||

274 | swizzleRegisterRep (Any _ codefn) format = Any format codefn | 291 | swizzleRegisterRep (Any _ codefn) format = Any format codefn | ||

275 | 292 | | |||

276 | 293 | | |||

277 | -- | Grab the Reg for a CmmReg | 294 | -- | Grab the Reg for a CmmReg | ||

278 | getRegisterReg :: Platform -> Bool -> CmmReg -> Reg | 295 | getRegisterReg :: Platform -> Bool -> CmmReg -> Reg | ||

279 | 296 | | |||

280 | getRegisterReg _ use_sse2 (CmmLocal (LocalReg u pk)) | 297 | getRegisterReg _ use_sse2 (CmmLocal (LocalReg u pk)) | ||

281 | = let fmt = cmmTypeFormat pk in | 298 | = let fmt = cmmTypeFormat pk in | ||

282 | if isFloatFormat fmt && not use_sse2 | 299 | if isFloatFormat fmt && not use_sse2 | ||

283 | then RegVirtual (mkVirtualReg u FF80) | 300 | then RegVirtual (mkVirtualReg u FF80) | ||

284 | else RegVirtual (mkVirtualReg u fmt) | 301 | else RegVirtual (mkVirtualReg u fmt) | ||

285 | 302 | | |||

286 | getRegisterReg platform _ (CmmGlobal mid) | 303 | getRegisterReg platform _ (CmmGlobal mid) | ||

287 | = case globalRegMaybe platform mid of | 304 | = case globalRegMaybe platform mid of | ||

288 | Just reg -> RegReal $ reg | 305 | Just reg -> RegReal $ reg | ||

289 | Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) | 306 | Nothing -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid) | ||

290 | -- By this stage, the only MagicIds remaining should be the | 307 | -- By this stage, the only MagicIds remaining should be the | ||

291 | -- ones which map to a real machine register on this | 308 | -- ones which map to a real machine register on this | ||

292 | -- platform. Hence ... | 309 | -- platform. Hence ... | ||

293 | 310 | | |||

294 | 311 | | |||

carter: remember what we said about format? | |||||

312 | getVecRegisterReg :: Platform -> Bool -> Format -> CmmReg -> Reg | ||||

AndreasK: Can we move this functionality into `getRegisterReg`? | |||||

AndreasK: ^ | |||||

I have been avoiding doing this change because in the
I can obviously work around it sending a Keeping a separate I was thinking of creating a separate type: data MicroArch = MicroArch { sse :: Bool , sse2 :: Bool , avx :: Bool .... } and modifying the signature of
Abhiroop: I have been avoiding doing this change because in the `getRegisterReg` function… | |||||

There are only about 30 uses which concern us.
That sounds like a great idea! Do you see issues stopping us from just making Maybe we can even get away with just We often have the pattern for sse of: use_sse2 <- sse2Enabled .... getRegisterReg ... use_sse2 Then we could do somthing like below instead: sse_level <-sseSupport :: m MicroArch ... getRegisterReg pf sse_level reg But I also understand the desire/need to get this working before you think about clean code. AndreasK: > and modify getRegisterReg almost everywhere, because it is frequently used.
There are only… | |||||

I had initially thought of something almost along the exact same lines and then defining an Also making Abhiroop: > Maybe we can even get away with just MicroArch = SSE2 | SSE4 | SSE4_2 | ...?
I had initially… | |||||

I think a simple bgamari: I think a simple `MicroArch` product would already be a great improvement over the status quo. | |||||

313 | getVecRegisterReg _ use_avx format (CmmLocal (LocalReg u pk)) | ||||

314 | | isVecType pk && use_avx = RegVirtual (mkVirtualReg u format) | ||||

315 | | otherwise = pprPanic | ||||

316 | (unlines ["avx flag is not enabled" , | ||||

317 | "or this is not a vector register"]) | ||||

318 | (ppr pk) | ||||

319 | getVecRegisterReg platform use_avx _ c = getRegisterReg platform use_avx c | ||||

320 | | ||||

295 | -- | Memory addressing modes passed up the tree. | 321 | -- | Memory addressing modes passed up the tree. | ||

296 | data Amode | 322 | data Amode | ||

297 | = Amode AddrMode InstrBlock | 323 | = Amode AddrMode InstrBlock | ||

298 | 324 | | |||

299 | {- | 325 | {- | ||

300 | Now, given a tree (the argument to a CmmLoad) that references memory, | 326 | Now, given a tree (the argument to a CmmLoad) that references memory, | ||

301 | produce a suitable addressing mode. | 327 | produce a suitable addressing mode. | ||

302 | 328 | | |||

▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Line(s) | |||||

349 | getSomeReg expr = do | 375 | getSomeReg expr = do | ||

350 | r <- getRegister expr | 376 | r <- getRegister expr | ||

351 | case r of | 377 | case r of | ||

352 | Any rep code -> do | 378 | Any rep code -> do | ||

353 | tmp <- getNewRegNat rep | 379 | tmp <- getNewRegNat rep | ||

354 | return (tmp, code tmp) | 380 | return (tmp, code tmp) | ||

355 | Fixed _ reg code -> | 381 | Fixed _ reg code -> | ||

356 | return (reg, code) | 382 | return (reg, code) | ||

357 | 383 | | |||

carter: fix the format vs vec format stuff, plus implement this | |||||

358 | 384 | | |||

359 | assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock | 385 | assignMem_I64Code :: CmmExpr -> CmmExpr -> NatM InstrBlock | ||

360 | assignMem_I64Code addrTree valueTree = do | 386 | assignMem_I64Code addrTree valueTree = do | ||

361 | Amode addr addr_code <- getAmode addrTree | 387 | Amode addr addr_code <- getAmode addrTree | ||

362 | ChildCode64 vcode rlo <- iselExpr64 valueTree | 388 | ChildCode64 vcode rlo <- iselExpr64 valueTree | ||

363 | let | 389 | let | ||

364 | rhi = getHiVRegFromLo rlo | 390 | rhi = getHiVRegFromLo rlo | ||

365 | 391 | | |||

▲ Show 20 Lines • Show All 116 Lines • ▼ Show 20 Line(s) | 503 | ChildCode64 (code `snocOL` | |||

482 | r_dst_lo | 508 | r_dst_lo | ||

483 | ) | 509 | ) | ||

484 | 510 | | |||

485 | iselExpr64 expr | 511 | iselExpr64 expr | ||

486 | = pprPanic "iselExpr64(i386)" (ppr expr) | 512 | = pprPanic "iselExpr64(i386)" (ppr expr) | ||

487 | 513 | | |||

488 | 514 | | |||

489 | -------------------------------------------------------------------------------- | 515 | -------------------------------------------------------------------------------- | ||

516 | | ||||

517 | -- This is a helper data type which helps reduce the code duplication for | ||||

518 | -- the code generation of arithmetic operations. This is not specifically | ||||

519 | -- targeted for any particular type like Int8, Int32 etc | ||||

520 | data VectorArithInstns = VA_Add | VA_Sub | VA_Mul | VA_Div | ||||

bgamari: Let's make these names more descriptive. | |||||

521 | | ||||

522 | | ||||

490 | getRegister :: CmmExpr -> NatM Register | 523 | getRegister :: CmmExpr -> NatM Register | ||

491 | getRegister e = do dflags <- getDynFlags | 524 | getRegister e = do dflags <- getDynFlags | ||

492 | is32Bit <- is32BitPlatform | 525 | is32Bit <- is32BitPlatform | ||

493 | getRegister' dflags is32Bit e | 526 | getRegister' dflags is32Bit e | ||

494 | 527 | | |||

495 | getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register | 528 | getRegister' :: DynFlags -> Bool -> CmmExpr -> NatM Register | ||

496 | 529 | | |||

497 | getRegister' dflags is32Bit (CmmReg reg) | 530 | getRegister' dflags is32Bit (CmmReg reg) | ||

498 | = case reg of | 531 | = case reg of | ||

499 | CmmGlobal PicBaseReg | 532 | CmmGlobal PicBaseReg | ||

500 | | is32Bit -> | 533 | | is32Bit -> | ||

501 | -- on x86_64, we have %rip for PicBaseReg, but it's not | 534 | -- on x86_64, we have %rip for PicBaseReg, but it's not | ||

502 | -- a full-featured register, it can only be used for | 535 | -- a full-featured register, it can only be used for | ||

503 | -- rip-relative addressing. | 536 | -- rip-relative addressing. | ||

504 | do reg' <- getPicBaseNat (archWordFormat is32Bit) | 537 | do reg' <- getPicBaseNat (archWordFormat is32Bit) | ||

505 | return (Fixed (archWordFormat is32Bit) reg' nilOL) | 538 | return (Fixed (archWordFormat is32Bit) reg' nilOL) | ||

506 | _ -> | 539 | _ -> | ||

507 | do use_sse2 <- sse2Enabled | 540 | do use_sse2 <- sse2Enabled | ||

508 | let | 541 | use_avx <- avxEnabled | ||

509 | fmt = cmmTypeFormat (cmmRegType dflags reg) | 542 | let cmmregtype = cmmRegType dflags reg | ||

510 | format | not use_sse2 && isFloatFormat fmt = FF80 | 543 | if isVecType cmmregtype | ||

511 | | otherwise = fmt | 544 | then return (vectorRegister cmmregtype use_avx use_sse2) | ||

512 | -- | 545 | else return (standardRegister cmmregtype use_avx use_sse2) | ||

513 | let platform = targetPlatform dflags | 546 | where | ||

514 | return (Fixed format | 547 | vectorRegister :: CmmType -> Bool -> Bool -> Register | ||

bgamari: These names are a bit terse and could perhaps use type annotations. | |||||

515 | (getRegisterReg platform use_sse2 reg) | 548 | vectorRegister reg_ty use_avx use_sse2 | ||

516 | nilOL) | 549 | | use_avx || use_sse2 = | ||

517 | 550 | let vecfmt = cmmTypeFormat reg_ty | |||

551 | platform = targetPlatform dflags | ||||

552 | in (Fixed vecfmt (getVecRegisterReg platform True vecfmt reg) nilOL) | ||||

553 | | otherwise = panic "Please enable the -mavx or -msse2 flag" | ||||

554 | | ||||

bgamari: Here too.
| |||||

555 | standardRegister :: CmmType -> Bool -> Bool -> Register | ||||

556 | standardRegister reg_ty use_avx use_sse2 = | ||||

557 | let fmt = cmmTypeFormat reg_ty | ||||

558 | format | not use_avx && not use_sse2 && isFloatFormat fmt | ||||

559 | = FF80 | ||||

560 | | otherwise | ||||

561 | = fmt | ||||

562 | platform = targetPlatform dflags | ||||

563 | in (Fixed format (getRegisterReg platform use_sse2 reg) nilOL) | ||||

518 | 564 | | |||

519 | getRegister' dflags is32Bit (CmmRegOff r n) | 565 | getRegister' dflags is32Bit (CmmRegOff r n) | ||

520 | = getRegister' dflags is32Bit $ mangleIndexTree dflags r n | 566 | = getRegister' dflags is32Bit $ mangleIndexTree dflags r n | ||

521 | 567 | | |||

522 | getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e]) | 568 | getRegister' dflags is32Bit (CmmMachOp (MO_AlignmentCheck align _) [e]) | ||

523 | = addAlignmentCheck align <$> getRegister' dflags is32Bit e | 569 | = addAlignmentCheck align <$> getRegister' dflags is32Bit e | ||

524 | 570 | | |||

525 | -- for 32-bit architectures, support some 64 -> 32 bit conversions: | 571 | -- for 32-bit architectures, support some 64 -> 32 bit conversions: | ||

▲ Show 20 Lines • Show All 100 Lines • ▼ Show 20 Line(s) | 670 | | not is32Bit = do | |||

626 | return (Any II64 code) | 672 | return (Any II64 code) | ||

627 | 673 | | |||

628 | getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), | 674 | getRegister' _ is32Bit (CmmMachOp (MO_Add W64) [CmmReg (CmmGlobal PicBaseReg), | ||

629 | CmmLit displacement]) | 675 | CmmLit displacement]) | ||

630 | | not is32Bit = do | 676 | | not is32Bit = do | ||

631 | return $ Any II64 (\dst -> unitOL $ | 677 | return $ Any II64 (\dst -> unitOL $ | ||

632 | LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst)) | 678 | LEA II64 (OpAddr (ripRel (litToImm displacement))) (OpReg dst)) | ||

633 | 679 | | |||

680 | getRegister' _ _ (CmmMachOp mop [x, y, z]) = do -- ternary MachOps | ||||

681 | sse4_1 <- sse4_1Enabled | ||||

682 | sse2 <- sse2Enabled | ||||

683 | sse <- sseEnabled | ||||

684 | case mop of | ||||

685 | MO_VF_Insert l W32 | sse4_1 && sse -> vector_float_pack l W32 x y z | ||||

686 | | otherwise | ||||

687 | -> sorry "Please enable the -msse4 and -msse flag" | ||||

688 | MO_VF_Insert l W64 | sse2 && sse -> vector_float_pack l W64 x y z | ||||

689 | | otherwise | ||||

690 | -> sorry "Please enable the -msse2 and -msse flag" | ||||

691 | _other -> incorrectOperands | ||||

692 | where | ||||

693 | vector_float_pack :: Length | ||||

694 | -> Width | ||||

695 | -> CmmExpr | ||||

696 | -> CmmExpr | ||||

697 | -> CmmExpr | ||||

698 | -> NatM Register | ||||

699 | vector_float_pack len W32 expr1 expr2 (CmmLit offset) | ||||

700 | = do | ||||

701 | fn <- getAnyReg expr1 | ||||

702 | (r, exp) <- getSomeReg expr2 | ||||

703 | let f = VecFormat len FmtFloat W32 | ||||

704 | imm = litToImm offset | ||||

705 | code dst = exp `appOL` | ||||

706 | (fn dst) `snocOL` | ||||

707 | (INSERTPS f (OpImm imm) (OpReg r) dst) | ||||

708 | in return $ Any f code | ||||

709 | vector_float_pack len W64 expr1 expr2 (CmmLit offset) | ||||

710 | = do | ||||

711 | Amode addr addr_code <- getAmode expr2 | ||||

712 | (r, exp) <- getSomeReg expr1 | ||||

713 | | ||||

714 | -- fn <- getAnyReg expr1 | ||||

715 | -- (r, exp) <- getSomeReg expr2 | ||||

716 | let f = VecFormat len FmtDouble W64 | ||||

717 | code dst | ||||

718 | = case offset of | ||||

719 | CmmInt 0 _ -> exp `appOL` addr_code `snocOL` | ||||

720 | (MOVL f (OpAddr addr) (OpReg r)) `snocOL` | ||||

721 | (MOVU f (OpReg r) (OpReg dst)) | ||||

722 | CmmInt 16 _ -> exp `appOL` addr_code `snocOL` | ||||

723 | (MOVH f (OpAddr addr) (OpReg r)) `snocOL` | ||||

724 | (MOVU f (OpReg r) (OpReg dst)) | ||||

725 | _ -> panic "Error in offset while packing" | ||||

Perhaps let's improve this panic a bit: "The offset passed to insert operations must be statically known." bgamari: Perhaps let's improve this panic a bit: "The offset passed to insert operations must be… | |||||

726 | -- code dst | ||||

727 | -- = case offset of | ||||

728 | -- CmmInt 0 _ -> exp `appOL` | ||||

729 | -- (fn dst) `snocOL` | ||||

730 | -- (MOVL f (OpReg r) (OpReg dst)) | ||||

731 | -- CmmInt 16 _ -> exp `appOL` | ||||

732 | -- (fn dst) `snocOL` | ||||

733 | -- (MOVH f (OpReg r) (OpReg dst)) | ||||

734 | -- _ -> panic "Error in offset while packing" | ||||

735 | in return $ Any f code | ||||

736 | vector_float_pack _ _ _ c _ | ||||

737 | = pprPanic "Pack not supported for : " (ppr c) | ||||

738 | | ||||

634 | getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps | 739 | getRegister' dflags is32Bit (CmmMachOp mop [x]) = do -- unary MachOps | ||

635 | sse2 <- sse2Enabled | 740 | sse2 <- sse2Enabled | ||

741 | sse <- sseEnabled | ||||

742 | avx <- avxEnabled | ||||

636 | case mop of | 743 | case mop of | ||

637 | MO_F_Neg w | 744 | MO_F_Neg w | ||

638 | | sse2 -> sse2NegCode w x | 745 | | sse2 -> sse2NegCode w x | ||

639 | | otherwise -> trivialUFCode FF80 (GNEG FF80) x | 746 | | otherwise -> trivialUFCode FF80 (GNEG FF80) x | ||

640 | 747 | | |||

641 | MO_S_Neg w -> triv_ucode NEGI (intFormat w) | 748 | MO_S_Neg w -> triv_ucode NEGI (intFormat w) | ||

642 | MO_Not w -> triv_ucode NOT (intFormat w) | 749 | MO_Not w -> triv_ucode NOT (intFormat w) | ||

643 | 750 | | |||

▲ Show 20 Lines • Show All 62 Lines • ▼ Show 20 Line(s) | 812 | MO_FF_Conv W32 W64 | |||

706 | | sse2 -> coerceFP2FP W64 x | 813 | | sse2 -> coerceFP2FP W64 x | ||

707 | | otherwise -> conversionNop FF80 x | 814 | | otherwise -> conversionNop FF80 x | ||

708 | 815 | | |||

709 | MO_FF_Conv W64 W32 -> coerceFP2FP W32 x | 816 | MO_FF_Conv W64 W32 -> coerceFP2FP W32 x | ||

710 | 817 | | |||

711 | MO_FS_Conv from to -> coerceFP2Int from to x | 818 | MO_FS_Conv from to -> coerceFP2Int from to x | ||

712 | MO_SF_Conv from to -> coerceInt2FP from to x | 819 | MO_SF_Conv from to -> coerceInt2FP from to x | ||

713 | 820 | | |||

714 | MO_V_Insert {} -> needLlvm | 821 | MO_V_Insert {} -> needLlvm | ||

715 | MO_V_Extract {} -> needLlvm | 822 | MO_V_Extract {} -> needLlvm | ||

716 | MO_V_Add {} -> needLlvm | 823 | MO_V_Add {} -> needLlvm | ||

717 | MO_V_Sub {} -> needLlvm | 824 | MO_V_Sub {} -> needLlvm | ||

718 | MO_V_Mul {} -> needLlvm | 825 | MO_V_Mul {} -> needLlvm | ||

719 | MO_VS_Quot {} -> needLlvm | 826 | MO_VS_Quot {} -> needLlvm | ||

720 | MO_VS_Rem {} -> needLlvm | 827 | MO_VS_Rem {} -> needLlvm | ||

721 | MO_VS_Neg {} -> needLlvm | 828 | MO_VS_Neg {} -> needLlvm | ||

722 | MO_VU_Quot {} -> needLlvm | 829 | MO_VU_Quot {} -> needLlvm | ||

723 | MO_VU_Rem {} -> needLlvm | 830 | MO_VU_Rem {} -> needLlvm | ||

724 | MO_VF_Insert {} -> needLlvm | 831 | MO_VF_Broadcast {} -> incorrectOperands | ||

725 | MO_VF_Extract {} -> needLlvm | 832 | MO_VF_Insert {} -> incorrectOperands | ||

726 | MO_VF_Add {} -> needLlvm | 833 | MO_VF_Extract {} -> incorrectOperands | ||

727 | MO_VF_Sub {} -> needLlvm | 834 | MO_VF_Add {} -> incorrectOperands | ||

Update the error message here. AndreasK: Update the error message here.
Add should now fail because it requires two operands, not… | |||||

728 | MO_VF_Mul {} -> needLlvm | 835 | MO_VF_Sub {} -> incorrectOperands | ||

729 | MO_VF_Quot {} -> needLlvm | 836 | MO_VF_Mul {} -> incorrectOperands | ||

730 | MO_VF_Neg {} -> needLlvm | 837 | MO_VF_Quot {} -> incorrectOperands | ||

838 | | ||||

839 | MO_VF_Neg l w | avx -> vector_float_negate_avx l w x | ||||

840 | | sse && sse2 -> vector_float_negate_sse l w x | ||||

841 | | otherwise | ||||

842 | -> sorry "Please enable the -mavx or -msse, -msse2 flag" | ||||

731 | 843 | | |||

732 | _other -> pprPanic "getRegister" (pprMachOp mop) | 844 | _other -> pprPanic "getRegister" (pprMachOp mop) | ||

733 | where | 845 | where | ||

734 | triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register | 846 | triv_ucode :: (Format -> Operand -> Instr) -> Format -> NatM Register | ||

735 | triv_ucode instr format = trivialUCode format (instr format) x | 847 | triv_ucode instr format = trivialUCode format (instr format) x | ||

736 | 848 | | |||

737 | -- signed or unsigned extension. | 849 | -- signed or unsigned extension. | ||

738 | integerExtend :: Width -> Width | 850 | integerExtend :: Width -> Width | ||

Show All 21 Lines | |||||

760 | 872 | | |||

761 | toI16Reg = toI8Reg -- for now | 873 | toI16Reg = toI8Reg -- for now | ||

762 | 874 | | |||

763 | conversionNop :: Format -> CmmExpr -> NatM Register | 875 | conversionNop :: Format -> CmmExpr -> NatM Register | ||

764 | conversionNop new_format expr | 876 | conversionNop new_format expr | ||

765 | = do e_code <- getRegister' dflags is32Bit expr | 877 | = do e_code <- getRegister' dflags is32Bit expr | ||

766 | return (swizzleRegisterRep e_code new_format) | 878 | return (swizzleRegisterRep e_code new_format) | ||

767 | 879 | | |||

880 | vector_float_negate_avx :: Length -> Width -> CmmExpr -> NatM Register | ||||

bgamari: We should probably rename this to include an `_avx` suffix. | |||||

881 | vector_float_negate_avx l w expr = do | ||||

882 | tmp <- getNewRegNat (VecFormat l FmtFloat w) | ||||

883 | (reg, exp) <- getSomeReg expr | ||||

884 | Amode addr addr_code <- memConstant (widthInBytes W32) (CmmFloat 0.0 W32) | ||||

Lint: Line Too Long: This line is 83 characters long, but the convention is 80 characters. | |||||

885 | let format = case w of | ||||

886 | W32 -> VecFormat l FmtFloat w | ||||

887 | W64 -> VecFormat l FmtDouble w | ||||

888 | _ -> pprPanic "Cannot negate vector of width" (ppr w) | ||||

889 | code dst = case w of | ||||

890 | W32 -> exp `appOL` addr_code `snocOL` | ||||

891 | (VBROADCAST format addr tmp) `snocOL` | ||||

892 | (VSUB format (OpReg reg) tmp dst) | ||||

893 | W64 -> exp `appOL` addr_code `snocOL` | ||||

894 | (MOVL format (OpAddr addr) (OpReg tmp)) `snocOL` | ||||

Lint: Line Too Long: This line is 82 characters long, but the convention is 80 characters. | |||||

895 | (MOVH format (OpAddr addr) (OpReg tmp)) `snocOL` | ||||

Lint: Line Too Long: This line is 82 characters long, but the convention is 80 characters. | |||||

896 | (VSUB format (OpReg reg) tmp dst) | ||||

897 | _ -> pprPanic "Cannot negate vector of width" (ppr w) | ||||

898 | return (Any format code) | ||||

899 | | ||||

900 | vector_float_negate_sse :: Length -> Width -> CmmExpr -> NatM Register | ||||

901 | vector_float_negate_sse l w expr = do | ||||

902 | tmp <- getNewRegNat (VecFormat l FmtFloat w) | ||||

903 | (reg, exp) <- getSomeReg expr | ||||

904 | let format = case w of | ||||

905 | W32 -> VecFormat l FmtFloat w | ||||

906 | W64 -> VecFormat l FmtDouble w | ||||

907 | _ -> pprPanic "Cannot negate vector of width" (ppr w) | ||||

908 | code dst = exp `snocOL` | ||||

909 | (XOR format (OpReg tmp) (OpReg tmp)) `snocOL` | ||||

910 | (MOVU format (OpReg tmp) (OpReg dst)) `snocOL` | ||||

911 | (SUB format (OpReg reg) (OpReg dst)) | ||||

912 | return (Any format code) | ||||

768 | 913 | | |||

769 | getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps | 914 | getRegister' _ is32Bit (CmmMachOp mop [x, y]) = do -- dyadic MachOps | ||

770 | sse2 <- sse2Enabled | 915 | sse4_1 <- sse4_1Enabled | ||

916 | sse2 <- sse2Enabled | ||||

917 | sse <- sseEnabled | ||||

918 | avx <- avxEnabled | ||||

771 | case mop of | 919 | case mop of | ||

772 | MO_F_Eq _ -> condFltReg is32Bit EQQ x y | 920 | MO_F_Eq _ -> condFltReg is32Bit EQQ x y | ||

773 | MO_F_Ne _ -> condFltReg is32Bit NE x y | 921 | MO_F_Ne _ -> condFltReg is32Bit NE x y | ||

774 | MO_F_Gt _ -> condFltReg is32Bit GTT x y | 922 | MO_F_Gt _ -> condFltReg is32Bit GTT x y | ||

775 | MO_F_Ge _ -> condFltReg is32Bit GE x y | 923 | MO_F_Ge _ -> condFltReg is32Bit GE x y | ||

776 | -- Invert comparison condition and swap operands | 924 | -- Invert comparison condition and swap operands | ||

777 | -- See Note [SSE Parity Checks] | 925 | -- See Note [SSE Parity Checks] | ||

778 | MO_F_Lt _ -> condFltReg is32Bit GTT y x | 926 | MO_F_Lt _ -> condFltReg is32Bit GTT y x | ||

Show All 31 Lines | |||||

810 | 958 | | |||

811 | MO_S_MulMayOflo rep -> imulMayOflo rep x y | 959 | MO_S_MulMayOflo rep -> imulMayOflo rep x y | ||

812 | 960 | | |||

813 | MO_Mul W8 -> imulW8 x y | 961 | MO_Mul W8 -> imulW8 x y | ||

814 | MO_Mul rep -> triv_op rep IMUL | 962 | MO_Mul rep -> triv_op rep IMUL | ||

815 | MO_And rep -> triv_op rep AND | 963 | MO_And rep -> triv_op rep AND | ||

816 | MO_Or rep -> triv_op rep OR | 964 | MO_Or rep -> triv_op rep OR | ||

817 | MO_Xor rep -> triv_op rep XOR | 965 | MO_Xor rep -> triv_op rep XOR | ||

818 | 966 | ----------------- | |||

819 | {- Shift ops on x86s have constraints on their source, it | 967 | {- Shift ops on x86s have constraints on their source, it | ||

820 | either has to be Imm, CL or 1 | 968 | either has to be Imm, CL or 1 | ||

821 | => trivialCode is not restrictive enough (sigh.) | 969 | => trivialCode is not restrictive enough (sigh.) | ||

822 | -} | 970 | -} | ||

823 | MO_Shl rep -> shift_code rep SHL x y {-False-} | 971 | MO_Shl rep -> shift_code rep SHL x y {-False-} | ||

824 | MO_U_Shr rep -> shift_code rep SHR x y {-False-} | 972 | MO_U_Shr rep -> shift_code rep SHR x y {-False-} | ||

825 | MO_S_Shr rep -> shift_code rep SAR x y {-False-} | 973 | MO_S_Shr rep -> shift_code rep SAR x y {-False-} | ||

826 | 974 | | |||

827 | MO_V_Insert {} -> needLlvm | 975 | MO_V_Insert {} -> needLlvm | ||

828 | MO_V_Extract {} -> needLlvm | 976 | MO_V_Extract {} -> needLlvm | ||

829 | MO_V_Add {} -> needLlvm | 977 | MO_V_Add {} -> needLlvm | ||

830 | MO_V_Sub {} -> needLlvm | 978 | MO_V_Sub {} -> needLlvm | ||

831 | MO_V_Mul {} -> needLlvm | 979 | MO_V_Mul {} -> needLlvm | ||

832 | MO_VS_Quot {} -> needLlvm | 980 | MO_VS_Quot {} -> needLlvm | ||

833 | MO_VS_Rem {} -> needLlvm | 981 | MO_VS_Rem {} -> needLlvm | ||

834 | MO_VS_Neg {} -> needLlvm | 982 | MO_VS_Neg {} -> needLlvm | ||

835 | MO_VF_Insert {} -> needLlvm | 983 | | ||

836 | MO_VF_Extract {} -> needLlvm | 984 | MO_VF_Broadcast l W32 | avx -> vector_float_broadcast_avx l W32 x y | ||

bgamari: `_avx` suffix here. | |||||

837 | MO_VF_Add {} -> needLlvm | 985 | | sse4_1 -> vector_float_broadcast_sse l W32 x y | ||

838 | MO_VF_Sub {} -> needLlvm | 986 | | otherwise | ||

839 | MO_VF_Mul {} -> needLlvm | 987 | -> sorry "Please enable the -mavx or -msse4 flag" | ||

840 | MO_VF_Quot {} -> needLlvm | 988 | | ||

841 | MO_VF_Neg {} -> needLlvm | 989 | MO_VF_Broadcast l W64 | sse2 -> vector_float_broadcast_avx l W64 x y | ||

990 | | otherwise -> sorry "Please enable the -msse2 flag" | ||||

991 | | ||||

992 | MO_VF_Extract l W32 | avx -> vector_float_unpack l W32 x y | ||||

993 | | sse -> vector_float_unpack_sse l W32 x y | ||||

994 | | otherwise | ||||

995 | -> sorry "Please enable the -mavx or -msse flag" | ||||

996 | | ||||

997 | MO_VF_Extract l W64 | sse2 -> vector_float_unpack l W64 x y | ||||

998 | | otherwise -> sorry "Please enable the -msse2 flag" | ||||

999 | | ||||

1000 | MO_VF_Add l w | avx -> vector_float_op_avx VA_Add l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1001 | | sse && w == W32 -> vector_float_op_sse VA_Add l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1002 | | sse2 && w == W64 -> vector_float_op_sse VA_Add l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1003 | | otherwise | ||||

1004 | -> sorry "Please enable the -mavx or -msse flag" | ||||

1005 | | ||||

1006 | MO_VF_Sub l w | avx -> vector_float_op_avx VA_Sub l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1007 | | sse && w == W32 -> vector_float_op_sse VA_Sub l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1008 | | sse2 && w == W64 -> vector_float_op_sse VA_Sub l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1009 | | otherwise | ||||

1010 | -> sorry "Please enable the -mavx or -msse flag" | ||||

1011 | | ||||

1012 | MO_VF_Mul l w | avx -> vector_float_op_avx VA_Mul l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1013 | | sse && w == W32 -> vector_float_op_sse VA_Mul l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1014 | | sse2 && w == W64 -> vector_float_op_sse VA_Mul l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1015 | | otherwise | ||||

1016 | -> sorry "Please enable the -mavx or -msse flag" | ||||

1017 | | ||||

1018 | MO_VF_Quot l w | avx -> vector_float_op_avx VA_Div l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1019 | | sse && w == W32 -> vector_float_op_sse VA_Div l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1020 | | sse2 && w == W64 -> vector_float_op_sse VA_Div l w x y | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1021 | | otherwise | ||||

1022 | -> sorry "Please enable the -mavx or -msse flag" | ||||

1023 | | ||||

1024 | MO_VF_Insert {} -> incorrectOperands | ||||

1025 | MO_VF_Neg {} -> incorrectOperands | ||||

negate should be easy ... don't we just do a floating-point bitwise XOR with a mask that's 1 on the sign bit and zero everywhere else? except ... one issue we're going to need to think about is how all these operations behave for NaN! (I suppose I can help with that after we get everything else working) carter: negate should be easy ... don't we just do a floating-point bitwise XOR with a mask that's 1 on… | |||||

842 | 1026 | | |||

843 | _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop) | 1027 | _other -> pprPanic "getRegister(x86) - binary CmmMachOp (1)" (pprMachOp mop) | ||

844 | where | 1028 | where | ||

845 | -------------------- | 1029 | -------------------- | ||

846 | triv_op width instr = trivialCode width op (Just op) x y | 1030 | triv_op width instr = trivialCode width op (Just op) x y | ||

847 | where op = instr (intFormat width) | 1031 | where op = instr (intFormat width) | ||

848 | 1032 | | |||

849 | -- Special case for IMUL for bytes, since the result of IMULB will be in | 1033 | -- Special case for IMUL for bytes, since the result of IMULB will be in | ||

▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Line(s) | |||||

929 | add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register | 1113 | add_code :: Width -> CmmExpr -> CmmExpr -> NatM Register | ||

930 | add_code rep x (CmmLit (CmmInt y _)) | 1114 | add_code rep x (CmmLit (CmmInt y _)) | ||

931 | | is32BitInteger y = add_int rep x y | 1115 | | is32BitInteger y = add_int rep x y | ||

932 | add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y | 1116 | add_code rep x y = trivialCode rep (ADD format) (Just (ADD format)) x y | ||

933 | where format = intFormat rep | 1117 | where format = intFormat rep | ||

934 | -- TODO: There are other interesting patterns we want to replace | 1118 | -- TODO: There are other interesting patterns we want to replace | ||

935 | -- with a LEA, e.g. `(x + offset) + (y << shift)`. | 1119 | -- with a LEA, e.g. `(x + offset) + (y << shift)`. | ||

936 | 1120 | | |||

1121 | ----------------------- | ||||

1122 | -- Vector operations--- | ||||

1123 | vector_float_op_avx :: VectorArithInstns | ||||

1124 | -> Length | ||||

1125 | -> Width | ||||

1126 | -> CmmExpr | ||||

1127 | -> CmmExpr | ||||

1128 | -> NatM Register | ||||

1129 | vector_float_op_avx op l w expr1 expr2 = do | ||||

1130 | (reg1, exp1) <- getSomeReg expr1 | ||||

1131 | (reg2, exp2) <- getSomeReg expr2 | ||||

1132 | let format = case w of | ||||

AndreasK: We should be able to handle this case with a single VMOVUPS. | |||||

1133 | W32 -> VecFormat l FmtFloat W32 | ||||

1134 | W64 -> VecFormat l FmtDouble W64 | ||||

1135 | _ -> pprPanic "Operation not supported for width " (ppr w) | ||||

Lint: Line Too Long: This line is 81 characters long, but the convention is 80 characters. | |||||

1136 | code dst = case op of | ||||

1137 | VA_Add -> arithInstr VADD | ||||

Assuming the pattern match is exhaustive why not make this a function that just fills in the instruction? AndreasK: Assuming the pattern match is exhaustive why not make this a function that just fills in the… | |||||

I am not sure I follow what you meant here. Do you mean instead of using a separate datatype having Abhiroop: I am not sure I follow what you meant here.
Do you mean instead of using a separate datatype… | |||||

I was thinking along the lines of that: code dst = case op of -- opcode src2 src1 dst <==> dst = src1 `opcode` src2 VADD -> arithInstr VADDPS VSUB -> arithInstr VSUBPS .... where arithInstr instr = unitOL (instr format (OpReg reg2) reg1 dst) AndreasK: I was thinking along the lines of that:
```
code dst = case op of… | |||||

1138 | VA_Sub -> arithInstr VSUB | ||||

1139 | VA_Mul -> arithInstr VMUL | ||||

1140 | VA_Div -> arithInstr VDIV | ||||

1141 | where | ||||

1142 | -- opcode src2 src1 dst <==> dst = src1 `opcode` src2 | ||||

1143 | arithInstr instr = exp1 `appOL` exp2 `snocOL` | ||||

1144 | (instr format (OpReg reg2) reg1 dst) | ||||

1145 | return (Any format code) | ||||

1146 | | ||||

1147 | vector_float_op_sse :: VectorArithInstns | ||||

1148 | -> Length | ||||

1149 | -> Width | ||||

1150 | -> CmmExpr | ||||

1151 | -> CmmExpr | ||||

1152 | -> NatM Register | ||||

1153 | vector_float_op_sse op l w expr1 expr2 = do | ||||

1154 | (reg1, exp1) <- getSomeReg expr1 | ||||

1155 | (reg2, exp2) <- getSomeReg expr2 | ||||

1156 | let format = case w of | ||||

1157 | W32 -> VecFormat l FmtFloat W32 | ||||

1158 | W64 -> VecFormat l FmtDouble W64 | ||||

1159 | _ -> pprPanic "Operation not supported for width " (ppr w) | ||||

Lint: Line Too Long: This line is 81 characters long, but the convention is 80 characters. | |||||

1160 | code dst = case op of | ||||

1161 | VA_Add -> arithInstr ADD | ||||

1162 | VA_Sub -> arithInstr SUB | ||||

1163 | VA_Mul -> arithInstr MUL | ||||

1164 | VA_Div -> arithInstr FDIV | ||||

1165 | where | ||||

1166 | -- opcode src2 src1 <==> src1 = src1 `opcode` src2 | ||||

1167 | arithInstr instr | ||||

1168 | = exp1 `appOL` exp2 `snocOL` | ||||

1169 | (MOVU format (OpReg reg1) (OpReg dst)) `snocOL` | ||||

1170 | (instr format (OpReg reg2) (OpReg dst)) | ||||

1171 | return (Any format code) | ||||

937 | -------------------- | 1172 | -------------------- | ||

1173 | vector_float_unpack :: Length | ||||

1174 | -> Width | ||||

1175 | -> CmmExpr | ||||

1176 | -> CmmExpr | ||||

1177 | -> NatM Register | ||||

1178 | vector_float_unpack l W32 expr (CmmLit lit) | ||||

1179 | = do | ||||

1180 | (r, exp) <- getSomeReg expr | ||||

1181 | let format = VecFormat l FmtFloat W32 | ||||

1182 | imm = litToImm lit | ||||

1183 | code dst | ||||

1184 | = case lit of | ||||

1185 | CmmInt 0 _ -> exp `snocOL` (VMOVU format (OpReg r) (OpReg dst)) | ||||

1186 | CmmInt _ _ -> exp `snocOL` (VPSHUFD format (OpImm imm) (OpReg r) dst) | ||||

Lint: Line Too Long: This line is 85 characters long, but the convention is 80 characters. | |||||

1187 | _ -> panic "Error in offset while unpacking" | ||||

1188 | return (Any format code) | ||||

The AVX version is probably faster (but verify that!). If so we should generate different code depending on what the cpu supports. Also why is it VMOVUP AndreasK: `VPSHUFD` requires AVX. Imo it is overly restrictive to require avx to unpack floats.
The AVX… | |||||

we are going to have both SSE and AVX encodings of most things; the current work is simplifying in the following ways - only handling 4xFloat# vectors

- first getting the AVX encoding working. (I think the ideal end state is to have the instruction subset toggled by -march-style flags)

later we should figure out how to have per-function-definition "march / ISA flag" pragmas, so end users can toggle per definition (though per module wouldn't be that bad I guess?) carter: we are going to have both SSE and AVX encodings of most things; the current work is simplifying in… | |||||

@AndreasK I want this to be dependent on the Abhiroop: @AndreasK **PS** in VMOVUPS refers to **single precision** float and the **D** in VPSHUFD… | |||||

I guess that makes sense as it can also be used for ints.
As far as I know for SIMD instructions the PS postfix usually indicates packed single-precision while SS stands for scalar single-precision. It's pretty clear that this holds up for VBROADCAST too if you look at the Intel instruction reference. - SS -> Broadcast the low single-precision floating-point element in the source operand to four locations in xmm1.
- SD -> ... double precision ...
- F128 -> Broadcast 128 bits of floating-point data ...
As soon as it isn't a scalar being broadcast the first S is dropped. AndreasK: > VPSHUFD refers to double word (or 32 bits).
I guess that makes sense as it can also be used… | |||||

1189 | vector_float_unpack l W64 expr (CmmLit lit) | ||||

1190 | = do | ||||

1191 | dflags <- getDynFlags | ||||

1192 | (r, exp) <- getSomeReg expr | ||||

1193 | let format = VecFormat l FmtDouble W64 | ||||

So I did some reading. Assuming the destination register is an xmm register: - If the offset is zero then we indeed should be able to just use VMOV as all scalar operations only use the lowest scalar value in the register.
See also the Intel Manual:
- If the index is NOT zero
We can shuffle the register so that we move the given index to the lowest position. That way we can use just a single VEXTRACTPS for general registers, and don't have to touch the memory for xmm destinations. AndreasK: So I did some reading.
Assuming the destination register is an xmm register:
* If the offset… | |||||

wait wait wait ... why do we have "(CmmLit lit)"? What's this argument for? This seems wrong (I don't think we have anything preventing literals from being in a register or something ... ), and even once we add / work out support for compile-time constants, they won't / shouldn't be normal CmmExpressions in the Cmm layer; they should be fields in the primop constructor! carter: wait wait wait ...

why do we have "(CmmLit lit)"

What's this argument for? This seems wrong (I… | |||||

@carter when you mentioned that this should be a part of a prim-op constructor do note that in this operation I am using Maybe I didn't get what you meant, but I think your comment was just for the shuffle operations where we should have something on the lines of
If you are asking about the utility of Abhiroop: > whats this argument for?
@carter when you mentioned that this should be a part of a prim-op… | |||||

1194 | addr = spRel dflags 0 | ||||

It sounds like we could do this without spilling the value out to memory. However, let's just add a TODO to avoid holding up the patch. bgamari: It sounds like we could do this without spilling the value out to memory. However, let's just… | |||||

1195 | code dst | ||||

1196 | = case lit of | ||||

1197 | CmmInt 0 _ -> exp `snocOL` | ||||

1198 | (MOVL format (OpReg r) (OpAddr addr)) `snocOL` | ||||

1199 | (MOV FF64 (OpAddr addr) (OpReg dst)) | ||||

1200 | CmmInt 1 _ -> exp `snocOL` | ||||

1201 | (MOVH format (OpReg r) (OpAddr addr)) `snocOL` | ||||

1202 | (MOV FF64 (OpAddr addr) (OpReg dst)) | ||||

1203 | _ -> panic "Error in offset while unpacking" | ||||

1204 | return (Any format code) | ||||

1205 | vector_float_unpack _ w c e | ||||

1206 | = pprPanic "Unpack not supported for : " (ppr c $$ ppr e $$ ppr w) | ||||

1207 | ----------------------- | ||||

1208 | | ||||

1209 | vector_float_unpack_sse :: Length | ||||

1210 | -> Width | ||||

1211 | -> CmmExpr | ||||

1212 | -> CmmExpr | ||||

1213 | -> NatM Register | ||||

1214 | vector_float_unpack_sse l W32 expr (CmmLit lit) | ||||

1215 | = do | ||||

1216 | (r,exp) <- getSomeReg expr | ||||

1217 | let format = VecFormat l FmtFloat W32 | ||||

1218 | imm = litToImm lit | ||||

1219 | code dst | ||||

1220 | = case lit of | ||||

1221 | CmmInt 0 _ -> exp `snocOL` (MOVU format (OpReg r) (OpReg dst)) | ||||

1222 | CmmInt _ _ -> exp `snocOL` (PSHUFD format (OpImm imm) (OpReg r) dst) | ||||

Lint: Line Too Long: This line is 84 characters long, but the convention is 80 characters. | |||||

1223 | _ -> panic "Error in offset while unpacking" | ||||

1224 | return (Any format code) | ||||

1225 | vector_float_unpack_sse _ w c e | ||||

1226 | = pprPanic "Unpack not supported for : " (ppr c $$ ppr e $$ ppr w) | ||||

1227 | ----------------------- | ||||

1228 | vector_float_broadcast_avx :: Length | ||||

1229 | -> Width | ||||

1230 | -> CmmExpr | ||||

1231 | -> CmmExpr | ||||

1232 | -> NatM Register | ||||

1233 | vector_float_broadcast_avx len W32 expr1 expr2 | ||||

1234 | = do | ||||

1235 | dflags <- getDynFlags | ||||

1236 | fn <- getAnyReg expr1 | ||||

1237 | (r', exp) <- getSomeReg expr2 | ||||

1238 | let f = VecFormat len FmtFloat W32 | ||||

1239 | addr = spRel dflags 0 | ||||

1240 | in return $ Any f (\r -> exp `appOL` | ||||

1241 | (fn r) `snocOL` | ||||

1242 | (MOVU f (OpReg r') (OpAddr addr)) `snocOL` | ||||

1243 | (VBROADCAST f addr r)) | ||||

1244 | vector_float_broadcast_avx len W64 expr1 expr2 | ||||

1245 | = do | ||||

1246 | dflags <- getDynFlags | ||||

1247 | fn <- getAnyReg expr1 | ||||

1248 | (r', exp) <- getSomeReg expr2 | ||||

1249 | let f = VecFormat len FmtDouble W64 | ||||

1250 | addr = spRel dflags 0 | ||||

1251 | in return $ Any f (\r -> exp `appOL` | ||||

1252 | (fn r) `snocOL` | ||||

1253 | (MOVU f (OpReg r') (OpAddr addr)) `snocOL` | ||||

1254 | (MOVL f (OpAddr addr) (OpReg r)) `snocOL` | ||||

1255 | (MOVH f (OpAddr addr) (OpReg r))) | ||||

1256 | vector_float_broadcast_avx _ _ c _ | ||||

1257 | = pprPanic "Broadcast not supported for : " (ppr c) | ||||

1258 | ----------------------- | ||||

1259 | vector_float_broadcast_sse :: Length | ||||

1260 | -> Width | ||||

1261 | -> CmmExpr | ||||

1262 | -> CmmExpr | ||||

1263 | -> NatM Register | ||||

1264 | vector_float_broadcast_sse len W32 expr1 expr2 | ||||

1265 | = do | ||||

1266 | dflags <- getDynFlags | ||||

1267 | fn <- getAnyReg expr1 -- destination | ||||

1268 | (r, exp) <- getSomeReg expr2 -- source | ||||

1269 | let f = VecFormat len FmtFloat W32 | ||||

1270 | addr = spRel dflags 0 | ||||

1271 | code dst = exp `appOL` | ||||

1272 | (fn dst) `snocOL` | ||||

1273 | (MOVU f (OpReg r) (OpAddr addr)) `snocOL` | ||||

1274 | (insertps 0) `snocOL` | ||||

bgamari: Let's factor out this pattern instead of duplicating the expression. | |||||

1275 | (insertps 16) `snocOL` | ||||

1276 | (insertps 32) `snocOL` | ||||

1277 | (insertps 48) | ||||

1278 | where | ||||

1279 | insertps off = | ||||

1280 | INSERTPS f (OpImm $ litToImm $ CmmInt off W32) (OpAddr addr) dst | ||||

1281 | | ||||

1282 | in return $ Any f code | ||||

1283 | vector_float_broadcast_sse _ _ c _ | ||||

1284 | = pprPanic "Broadcast not supported for : " (ppr c) | ||||

1285 | ----------------------- | ||||

938 | sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register | 1286 | sub_code :: Width -> CmmExpr -> CmmExpr -> NatM Register | ||

939 | sub_code rep x (CmmLit (CmmInt y _)) | 1287 | sub_code rep x (CmmLit (CmmInt y _)) | ||

940 | | is32BitInteger (-y) = add_int rep x (-y) | 1288 | | is32BitInteger (-y) = add_int rep x (-y) | ||

941 | sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y | 1289 | sub_code rep x y = trivialCode rep (SUB (intFormat rep)) Nothing x y | ||

942 | 1290 | | |||

943 | -- our three-operand add instruction: | 1291 | -- our three-operand add instruction: | ||

944 | add_int width x y = do | 1292 | add_int width x y = do | ||

945 | (x_reg, x_code) <- getSomeReg x | 1293 | (x_reg, x_code) <- getSomeReg x | ||

Show All 36 Lines | 1329 | code = y_code `appOL` | |||

982 | x_code eax `appOL` | 1330 | x_code eax `appOL` | ||

983 | toOL [widen, instr format y_op] | 1331 | toOL [widen, instr format y_op] | ||

984 | 1332 | | |||

985 | result | quotient = eax | 1333 | result | quotient = eax | ||

986 | | otherwise = edx | 1334 | | otherwise = edx | ||

987 | 1335 | | |||

988 | return (Fixed format result code) | 1336 | return (Fixed format result code) | ||

989 | 1337 | | |||

1338 | getRegister' _ _ (CmmLoad mem pk) | ||||

1339 | | isVecType pk = do | ||||

1340 | use_avx <- avxEnabled | ||||

1341 | use_sse <- sseEnabled | ||||

1342 | Amode addr mem_code <- getAmode mem | ||||

1343 | let format = cmmTypeFormat pk | ||||

1344 | code dst | ||||

1345 | | use_avx = mem_code `snocOL` | ||||

1346 | VMOVU format (OpAddr addr) (OpReg dst) | ||||

1347 | | use_sse = mem_code `snocOL` | ||||

1348 | MOVU format (OpAddr addr) (OpReg dst) | ||||

1349 | | otherwise = pprPanic (unlines ["avx or sse flag not enabled", | ||||

1350 | "for loading to "]) | ||||

1351 | (ppr pk) | ||||

1352 | return (Any format code) | ||||

990 | 1353 | | |||

991 | getRegister' _ _ (CmmLoad mem pk) | 1354 | getRegister' _ _ (CmmLoad mem pk) | ||

992 | | isFloatType pk | 1355 | | isFloatType pk | ||

993 | = do | 1356 | = do | ||

994 | Amode addr mem_code <- getAmode mem | 1357 | Amode addr mem_code <- getAmode mem | ||

995 | use_sse2 <- sse2Enabled | 1358 | use_sse2 <- sse2Enabled | ||

996 | loadFloatAmode use_sse2 (typeWidth pk) addr mem_code | 1359 | loadFloatAmode use_sse2 (typeWidth pk) addr mem_code | ||

997 | 1360 | | |||

▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Line(s) | 1403 | getRegister' dflags is32Bit (CmmLit lit) | |||

1049 | isBigLit _ = False | 1412 | isBigLit _ = False | ||

1050 | -- note1: not the same as (not.is32BitLit), because that checks for | 1413 | -- note1: not the same as (not.is32BitLit), because that checks for | ||

1051 | -- signed literals that fit in 32 bits, but we want unsigned | 1414 | -- signed literals that fit in 32 bits, but we want unsigned | ||

1052 | -- literals here. | 1415 | -- literals here. | ||

1053 | -- note2: all labels are small, because we're assuming the | 1416 | -- note2: all labels are small, because we're assuming the | ||

1054 | -- small memory model (see gcc docs, -mcmodel=small). | 1417 | -- small memory model (see gcc docs, -mcmodel=small). | ||

1055 | 1418 | | |||

1056 | getRegister' dflags _ (CmmLit lit) | 1419 | getRegister' dflags _ (CmmLit lit) | ||

1057 | = do let format = cmmTypeFormat (cmmLitType dflags lit) | 1420 | = do let cmmtype = cmmLitType dflags lit | ||

1058 | imm = litToImm lit | 1421 | if isVecType cmmtype | ||

1059 | code dst = unitOL (MOV format (OpImm imm) (OpReg dst)) | 1422 | then (vectorRegister cmmtype) | ||

1060 | return (Any format code) | 1423 | else (standardRegister cmmtype) | ||

1424 | where | ||||

1425 | vectorRegister ctype | ||||

1426 | = do | ||||

1427 | --NOTE: | ||||

1428 | -- This operation is only used to zero a register. For loading a | ||||

1429 | -- vector literal there are pack and broadcast operations | ||||

1430 | let format = cmmTypeFormat ctype | ||||

1431 | code dst = unitOL (XOR format (OpReg dst) (OpReg dst)) | ||||

1432 | return (Any format code) | ||||

1433 | standardRegister ctype | ||||

1434 | = do | ||||

1435 | let format = cmmTypeFormat ctype | ||||

1436 | imm = litToImm lit | ||||

1437 | code dst = unitOL (MOV format (OpImm imm) (OpReg dst)) | ||||

1438 | return (Any format code) | ||||

1061 | 1439 | | |||

1062 | getRegister' _ _ other | 1440 | getRegister' _ _ other | ||

1063 | | isVecExpr other = needLlvm | 1441 | | isVecExpr other = needLlvm | ||

1064 | | otherwise = pprPanic "getRegister(x86)" (ppr other) | 1442 | | otherwise = pprPanic "getRegister(x86)" (ppr other) | ||

1065 | 1443 | | |||

1066 | 1444 | | |||

1067 | intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr | 1445 | intLoadCode :: (Operand -> Operand -> Instr) -> CmmExpr | ||

1068 | -> NatM (Reg -> InstrBlock) | 1446 | -> NatM (Reg -> InstrBlock) | ||

1069 | intLoadCode instr mem = do | 1447 | intLoadCode instr mem = do | ||

1070 | Amode src mem_code <- getAmode mem | 1448 | Amode src mem_code <- getAmode mem | ||

1071 | return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst)) | 1449 | return (\dst -> mem_code `snocOL` instr (OpAddr src) (OpReg dst)) | ||

1072 | 1450 | | |||

1073 | -- Compute an expression into *any* register, adding the appropriate | 1451 | -- Compute an expression into *any* register, adding the appropriate | ||

1074 | -- move instruction if necessary. | 1452 | -- move instruction if necessary. | ||

1075 | getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock) | 1453 | getAnyReg :: CmmExpr -> NatM (Reg -> InstrBlock) | ||

1076 | getAnyReg expr = do | 1454 | getAnyReg expr = do | ||

1077 | r <- getRegister expr | 1455 | r <- getRegister expr | ||

1078 | anyReg r | 1456 | anyReg r | ||

1079 | 1457 | | |||

1080 | anyReg :: Register -> NatM (Reg -> InstrBlock) | 1458 | anyReg :: Register -> NatM (Reg -> InstrBlock) | ||

1081 | anyReg (Any _ code) = return code | 1459 | anyReg (Any _ code) = return code | ||

1082 | anyReg (Fixed rep reg fcode) = return (\dst -> fcode `snocOL` reg2reg rep reg dst) | 1460 | anyReg (Fixed rep reg fcode) | ||

1083 | 1461 | = return (\dst -> fcode `snocOL` reg2reg rep reg dst) | |||

1084 | -- A bit like getSomeReg, but we want a reg that can be byte-addressed. | 1462 | -- A bit like getSomeReg, but we want a reg that can be byte-addressed. | ||

1085 | -- Fixed registers might not be byte-addressable, so we make sure we've | 1463 | -- Fixed registers might not be byte-addressable, so we make sure we've | ||

1086 | -- got a temporary, inserting an extra reg copy if necessary. | 1464 | -- got a temporary, inserting an extra reg copy if necessary. | ||

1087 | getByteReg :: CmmExpr -> NatM (Reg, InstrBlock) | 1465 | getByteReg :: CmmExpr -> NatM (Reg, InstrBlock) | ||

1088 | getByteReg expr = do | 1466 | getByteReg expr = do | ||

1089 | is32Bit <- is32BitPlatform | 1467 | is32Bit <- is32BitPlatform | ||

1090 | if is32Bit | 1468 | if is32Bit | ||

1091 | then do r <- getRegister expr | 1469 | then do r <- getRegister expr | ||

1092 | case r of | 1470 | case r of | ||

1093 | Any rep code -> do | 1471 | Any rep code -> do | ||

1094 | tmp <- getNewRegNat rep | 1472 | tmp <- getNewRegNat rep | ||

1095 | return (tmp, code tmp) | 1473 | return (tmp, code tmp) | ||

1096 | Fixed rep reg code | 1474 | Fixed rep reg code | ||

1097 | | isVirtualReg reg -> return (reg,code) | 1475 | | isVirtualReg reg -> return (reg,code) | ||

1098 | | otherwise -> do | 1476 | | otherwise -> do | ||

1099 | tmp <- getNewRegNat rep | 1477 | tmp <- getNewRegNat rep | ||

1100 | return (tmp, code `snocOL` reg2reg rep reg tmp) | 1478 | return (tmp, code `snocOL` reg2reg rep reg tmp) | ||

1101 | -- ToDo: could optimise slightly by checking for | 1479 | -- ToDo: could optimise slightly by checking for | ||

1102 | -- byte-addressable real registers, but that will | 1480 | -- byte-addressable real registers, but that will | ||

1103 | -- happen very rarely if at all. | 1481 | -- happen very rarely if at all. | ||

1104 | else getSomeReg expr -- all regs are byte-addressable on x86_64 | 1482 | else getSomeReg expr -- all regs are byte-addressable on x86_64 | ||

notice that the comment on literally the next line states "all registers are byte addressable in x86_64". This is definitely true for float/double values, and thus reasonably true for SIMD too? edit: ohhh, this is the 32-bit case carter: notice that the comment on literally the next line states "all registers are byte addressable… | |||||

1105 | 1483 | | |||

1106 | -- Another variant: this time we want the result in a register that cannot | 1484 | -- Another variant: this time we want the result in a register that cannot | ||

1107 | -- be modified by code to evaluate an arbitrary expression. | 1485 | -- be modified by code to evaluate an arbitrary expression. | ||

1108 | getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock) | 1486 | getNonClobberedReg :: CmmExpr -> NatM (Reg, InstrBlock) | ||

1109 | getNonClobberedReg expr = do | 1487 | getNonClobberedReg expr = do | ||

1110 | dflags <- getDynFlags | 1488 | dflags <- getDynFlags | ||

1111 | r <- getRegister expr | 1489 | r <- getRegister expr | ||

1112 | case r of | 1490 | case r of | ||

1113 | Any rep code -> do | 1491 | Any rep code -> do | ||

1114 | tmp <- getNewRegNat rep | 1492 | tmp <- getNewRegNat rep | ||

1115 | return (tmp, code tmp) | 1493 | return (tmp, code tmp) | ||

1116 | Fixed rep reg code | 1494 | Fixed rep reg code | ||

1117 | -- only certain regs can be clobbered | 1495 | -- only certain regs can be clobbered | ||

1118 | | reg `elem` instrClobberedRegs (targetPlatform dflags) | 1496 | | reg `elem` instrClobberedRegs (targetPlatform dflags) | ||

1119 | -> do | 1497 | -> do | ||

1120 | tmp <- getNewRegNat rep | 1498 | tmp <- getNewRegNat rep | ||

1121 | return (tmp, code `snocOL` reg2reg rep reg tmp) | 1499 | return (tmp, code `snocOL` reg2reg rep reg tmp) | ||

1122 | | otherwise -> | 1500 | | otherwise -> | ||

1123 | return (reg, code) | 1501 | return (reg, code) | ||

1124 | 1502 | | |||

1125 | reg2reg :: Format -> Reg -> Reg -> Instr | 1503 | reg2reg :: Format -> Reg -> Reg -> Instr | ||

1504 | reg2reg format@(VecFormat _ FmtFloat W32) src dst | ||||

1505 | = VMOVU format (OpReg src) (OpReg dst) | ||||

1506 | reg2reg format@(VecFormat _ FmtDouble W64) src dst | ||||

1507 | = VMOVU format (OpReg src) (OpReg dst) | ||||

1508 | reg2reg (VecFormat _ _ _) _ _ | ||||

1509 | = panic "MOV operation not implemented for vectors" | ||||

1126 | reg2reg format src dst | 1510 | reg2reg format src dst | ||

1127 | | format == FF80 = GMOV src dst | 1511 | | format == FF80 = GMOV src dst | ||

1128 | | otherwise = MOV format (OpReg src) (OpReg dst) | 1512 | | otherwise = MOV format (OpReg src) (OpReg dst) | ||

AndreasK: I think you will need to add a case handling vectors here. | |||||

1129 | 1513 | | |||

1130 | | ||||

1131 | -------------------------------------------------------------------------------- | 1514 | -------------------------------------------------------------------------------- | ||

1132 | getAmode :: CmmExpr -> NatM Amode | 1515 | getAmode :: CmmExpr -> NatM Amode | ||

1133 | getAmode e = do is32Bit <- is32BitPlatform | 1516 | getAmode e = do is32Bit <- is32BitPlatform | ||

1134 | getAmode' is32Bit e | 1517 | getAmode' is32Bit e | ||

1135 | 1518 | | |||

1136 | getAmode' :: Bool -> CmmExpr -> NatM Amode | 1519 | getAmode' :: Bool -> CmmExpr -> NatM Amode | ||

1137 | getAmode' _ (CmmRegOff r n) = do dflags <- getDynFlags | 1520 | getAmode' _ (CmmRegOff r n) = do dflags <- getDynFlags | ||

1138 | getAmode $ mangleIndexTree dflags r n | 1521 | getAmode $ mangleIndexTree dflags r n | ||

▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Line(s) | 1564 | [x, CmmMachOp (MO_Add _) | |||

1183 | CmmLit (CmmInt offset _)]]) | 1566 | CmmLit (CmmInt offset _)]]) | ||

1184 | | shift == 0 || shift == 1 || shift == 2 || shift == 3 | 1567 | | shift == 0 || shift == 1 || shift == 2 || shift == 3 | ||

1185 | && is32BitInteger offset | 1568 | && is32BitInteger offset | ||

1186 | = x86_complex_amode x y shift offset | 1569 | = x86_complex_amode x y shift offset | ||

1187 | 1570 | | |||

1188 | getAmode' _ (CmmMachOp (MO_Add _) [x,y]) | 1571 | getAmode' _ (CmmMachOp (MO_Add _) [x,y]) | ||

1189 | = x86_complex_amode x y 0 0 | 1572 | = x86_complex_amode x y 0 0 | ||

1190 | 1573 | | |||

1574 | getAmode' _ (CmmLit lit@(CmmFloat _ w)) | ||||

1575 | = memConstant (widthInBytes w) lit | ||||

1576 | | ||||

1191 | getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit | 1577 | getAmode' is32Bit (CmmLit lit) | is32BitLit is32Bit lit | ||

1192 | = return (Amode (ImmAddr (litToImm lit) 0) nilOL) | 1578 | = return (Amode (ImmAddr (litToImm lit) 0) nilOL) | ||

1193 | 1579 | | |||

1194 | getAmode' _ expr = do | 1580 | getAmode' _ expr = do | ||

1195 | (reg,code) <- getSomeReg expr | 1581 | (reg,code) <- getSomeReg expr | ||

1196 | return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code) | 1582 | return (Amode (AddrBaseIndex (EABaseReg reg) EAIndexNone (ImmInt 0)) code) | ||

1197 | 1583 | | |||

1198 | -- | Like 'getAmode', but on 32-bit use simple register addressing | 1584 | -- | Like 'getAmode', but on 32-bit use simple register addressing | ||

▲ Show 20 Lines • Show All 372 Lines • ▼ Show 20 Line(s) | |||||

1571 | -- (e.g. the result of a call). | 1957 | -- (e.g. the result of a call). | ||

1572 | 1958 | | |||

1573 | assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock | 1959 | assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock | ||

1574 | assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock | 1960 | assignReg_IntCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock | ||

1575 | 1961 | | |||

1576 | assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock | 1962 | assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock | ||

1577 | assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock | 1963 | assignReg_FltCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock | ||

1578 | 1964 | | |||

1579 | 1965 | assignMem_VecCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock | |||

1966 | assignReg_VecCode :: Format -> CmmReg -> CmmExpr -> NatM InstrBlock | ||||

1580 | -- integer assignment to memory | 1967 | -- integer assignment to memory | ||

1581 | 1968 | | |||

1582 | -- specific case of adding/subtracting an integer to a particular address. | 1969 | -- specific case of adding/subtracting an integer to a particular address. | ||

1583 | -- ToDo: catch other cases where we can use an operation directly on a memory | 1970 | -- ToDo: catch other cases where we can use an operation directly on a memory | ||

1584 | -- address. | 1971 | -- address. | ||

1585 | assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _, | 1972 | assignMem_IntCode pk addr (CmmMachOp op [CmmLoad addr2 _, | ||

1586 | CmmLit (CmmInt i _)]) | 1973 | CmmLit (CmmInt i _)]) | ||

1587 | | addr == addr2, pk /= II64 || is32BitInteger i, | 1974 | | addr == addr2, pk /= II64 || is32BitInteger i, | ||

▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Line(s) | |||||

1629 | 2016 | | |||

1630 | -- dst is a reg, but src could be anything | 2017 | -- dst is a reg, but src could be anything | ||

1631 | assignReg_IntCode _ reg src = do | 2018 | assignReg_IntCode _ reg src = do | ||

1632 | dflags <- getDynFlags | 2019 | dflags <- getDynFlags | ||

1633 | let platform = targetPlatform dflags | 2020 | let platform = targetPlatform dflags | ||

1634 | code <- getAnyReg src | 2021 | code <- getAnyReg src | ||

1635 | return (code (getRegisterReg platform False{-no sse2-} reg)) | 2022 | return (code (getRegisterReg platform False{-no sse2-} reg)) | ||

1636 | 2023 | | |||

1637 | | ||||

1638 | -- Floating point assignment to memory | 2024 | -- Floating point assignment to memory | ||

1639 | assignMem_FltCode pk addr src = do | 2025 | assignMem_FltCode pk addr src = do | ||

1640 | (src_reg, src_code) <- getNonClobberedReg src | 2026 | (src_reg, src_code) <- getNonClobberedReg src | ||

1641 | Amode addr addr_code <- getAmode addr | 2027 | Amode addr addr_code <- getAmode addr | ||

1642 | use_sse2 <- sse2Enabled | 2028 | use_sse2 <- sse2Enabled | ||

1643 | let | 2029 | let | ||

1644 | code = src_code `appOL` | 2030 | code = src_code `appOL` | ||

1645 | addr_code `snocOL` | 2031 | addr_code `snocOL` | ||

1646 | if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr) | 2032 | if use_sse2 then MOV pk (OpReg src_reg) (OpAddr addr) | ||

1647 | else GST pk src_reg addr | 2033 | else GST pk src_reg addr | ||

1648 | return code | 2034 | return code | ||

1649 | 2035 | | |||

1650 | -- Floating point assignment to a register/temporary | 2036 | -- Floating point assignment to a register/temporary | ||

1651 | assignReg_FltCode _ reg src = do | 2037 | assignReg_FltCode _ reg src = do | ||

1652 | use_sse2 <- sse2Enabled | 2038 | use_sse2 <- sse2Enabled | ||

1653 | src_code <- getAnyReg src | 2039 | src_code <- getAnyReg src | ||

1654 | dflags <- getDynFlags | 2040 | dflags <- getDynFlags | ||

1655 | let platform = targetPlatform dflags | 2041 | let platform = targetPlatform dflags | ||

1656 | return (src_code (getRegisterReg platform use_sse2 reg)) | 2042 | return (src_code (getRegisterReg platform use_sse2 reg)) | ||

1657 | 2043 | | |||

2044 | assignMem_VecCode pk addr src = do | ||||

2045 | (src_reg, src_code) <- getNonClobberedReg src | ||||

2046 | Amode addr addr_code <- getAmode addr | ||||

2047 | use_avx <- avxEnabled | ||||

2048 | use_sse <- sseEnabled | ||||

2049 | let | ||||

2050 | code | use_avx = src_code `appOL` | ||||

2051 | addr_code `snocOL` | ||||

2052 | (VMOVU pk (OpReg src_reg) (OpAddr addr)) | ||||

2053 | | use_sse = src_code `appOL` | ||||

2054 | addr_code `snocOL` | ||||

2055 | (MOVU pk (OpReg src_reg) (OpAddr addr)) | ||||

2056 | | otherwise = sorry "Please enable the -mavx or -msse flag" | ||||

2057 | return code | ||||

2058 | | ||||

2059 | assignReg_VecCode format reg src = do | ||||

2060 | use_avx <- avxEnabled | ||||

2061 | use_sse <- sseEnabled | ||||

2062 | src_code <- getAnyReg src | ||||

2063 | dflags <- getDynFlags | ||||

2064 | let platform = targetPlatform dflags | ||||

2065 | flag = use_avx || use_sse | ||||

2066 | return (src_code (getVecRegisterReg platform flag format reg)) | ||||

1658 | 2067 | | |||

1659 | genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock | 2068 | genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock | ||

1660 | 2069 | | |||

1661 | genJump (CmmLoad mem _) regs = do | 2070 | genJump (CmmLoad mem _) regs = do | ||

1662 | Amode target code <- getAmode mem | 2071 | Amode target code <- getAmode mem | ||

1663 | return (code `snocOL` JMP (OpAddr target) regs) | 2072 | return (code `snocOL` JMP (OpAddr target) regs) | ||

1664 | 2073 | | |||

1665 | genJump (CmmLit lit) regs = do | 2074 | genJump (CmmLit lit) regs = do | ||

▲ Show 20 Lines • Show All 1531 Lines • ▼ Show 20 Line(s) | 3605 | code dst | |||

3197 | = b_code dst `snocOL` | 3606 | = b_code dst `snocOL` | ||

3198 | revinstr (OpImm (litToImm lit_a)) (OpReg dst) | 3607 | revinstr (OpImm (litToImm lit_a)) (OpReg dst) | ||

3199 | return (Any (intFormat width) code) | 3608 | return (Any (intFormat width) code) | ||

3200 | 3609 | | |||

3201 | trivialCode' _ width instr _ a b | 3610 | trivialCode' _ width instr _ a b | ||

3202 | = genTrivialCode (intFormat width) instr a b | 3611 | = genTrivialCode (intFormat width) instr a b | ||

3203 | 3612 | | |||

3204 | -- This is re-used for floating pt instructions too. | 3613 | -- This is re-used for floating pt instructions too. | ||

3205 | genTrivialCode :: Format -> (Operand -> Operand -> Instr) | 3614 | genTrivialCode :: Format -> (Operand -> Operand -> Instr) | ||

Ohhh, I see — previously all floating-point stuff was SSE style, which, like general-purpose register operations, UPDATES one of its arguments rather than having the result register selected by the register allocator. That said, we will still want to support SSE2–SSE4 vector instructions, which DO clobber. carter: Ohhh, I see, previously all floating-point stuff was SSE style, which like general purpose… | |||||

3206 | -> CmmExpr -> CmmExpr -> NatM Register | 3615 | -> CmmExpr -> CmmExpr -> NatM Register | ||

3207 | genTrivialCode rep instr a b = do | 3616 | genTrivialCode rep instr a b = do | ||

3208 | (b_op, b_code) <- getNonClobberedOperand b | 3617 | (b_op, b_code) <- getNonClobberedOperand b | ||

3209 | a_code <- getAnyReg a | 3618 | a_code <- getAnyReg a | ||

3210 | tmp <- getNewRegNat rep | 3619 | tmp <- getNewRegNat rep | ||

3211 | let | 3620 | let | ||

3212 | -- We want the value of b to stay alive across the computation of a. | 3621 | -- We want the value of b to stay alive across the computation of a. | ||

3213 | -- But, we want to calculate a straight into the destination register, | 3622 | -- But, we want to calculate a straight into the destination register, | ||

▲ Show 20 Lines • Show All 128 Lines • ▼ Show 20 Line(s) | |||||

3342 | 3751 | | |||

3343 | sse2NegCode :: Width -> CmmExpr -> NatM Register | 3752 | sse2NegCode :: Width -> CmmExpr -> NatM Register | ||

3344 | sse2NegCode w x = do | 3753 | sse2NegCode w x = do | ||

3345 | let fmt = floatFormat w | 3754 | let fmt = floatFormat w | ||

3346 | x_code <- getAnyReg x | 3755 | x_code <- getAnyReg x | ||

3347 | -- This is how gcc does it, so it can't be that bad: | 3756 | -- This is how gcc does it, so it can't be that bad: | ||

3348 | let | 3757 | let | ||

3349 | const = case fmt of | 3758 | const = case fmt of | ||

3350 | FF32 -> CmmInt 0x80000000 W32 | 3759 | FF32 -> CmmInt 0x80000000 W32 | ||

3351 | FF64 -> CmmInt 0x8000000000000000 W64 | 3760 | FF64 -> CmmInt 0x8000000000000000 W64 | ||

3352 | x@II8 -> wrongFmt x | 3761 | x@II8 -> wrongFmt x | ||

3353 | x@II16 -> wrongFmt x | 3762 | x@II16 -> wrongFmt x | ||

3354 | x@II32 -> wrongFmt x | 3763 | x@II32 -> wrongFmt x | ||

3355 | x@II64 -> wrongFmt x | 3764 | x@II64 -> wrongFmt x | ||

3356 | x@FF80 -> wrongFmt x | 3765 | x@FF80 -> wrongFmt x | ||

3766 | x@VecFormat {} -> wrongFmt x | ||||

If there is a SIMD "negate vector" instruction, this would probably be the place to use it. AndreasK: If there is a SIMD "negate vector" instruction, this would probably be the place to use it. | |||||

3357 | where | 3767 | where | ||

3358 | wrongFmt x = panic $ "sse2NegCode: " ++ show x | 3768 | wrongFmt x = panic $ "sse2NegCode: " ++ show x | ||

3359 | Amode amode amode_code <- memConstant (widthInBytes w) const | 3769 | Amode amode amode_code <- memConstant (widthInBytes w) const | ||

3360 | tmp <- getNewRegNat fmt | 3770 | tmp <- getNewRegNat fmt | ||

3361 | let | 3771 | let | ||

3362 | code dst = x_code dst `appOL` amode_code `appOL` toOL [ | 3772 | code dst = x_code dst `appOL` amode_code `appOL` toOL [ | ||

3363 | MOV fmt (OpAddr amode) (OpReg tmp), | 3773 | MOV fmt (OpAddr amode) (OpReg tmp), | ||

3364 | XOR fmt (OpReg tmp) (OpReg dst) | 3774 | XOR fmt (OpReg tmp) (OpReg dst) | ||

3365 | ] | 3775 | ] | ||

3366 | -- | 3776 | -- | ||

3367 | return (Any fmt code) | 3777 | return (Any fmt code) | ||

3368 | 3778 | | |||

3369 | isVecExpr :: CmmExpr -> Bool | 3779 | isVecExpr :: CmmExpr -> Bool | ||

3370 | isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True | 3780 | isVecExpr (CmmMachOp (MO_V_Insert {}) _) = True | ||

3371 | isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True | 3781 | isVecExpr (CmmMachOp (MO_V_Extract {}) _) = True | ||

3372 | isVecExpr (CmmMachOp (MO_V_Add {}) _) = True | 3782 | isVecExpr (CmmMachOp (MO_V_Add {}) _) = True | ||

3373 | isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True | 3783 | isVecExpr (CmmMachOp (MO_V_Sub {}) _) = True | ||

3374 | isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True | 3784 | isVecExpr (CmmMachOp (MO_V_Mul {}) _) = True | ||

3375 | isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True | 3785 | isVecExpr (CmmMachOp (MO_VS_Quot {}) _) = True | ||

3376 | isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True | 3786 | isVecExpr (CmmMachOp (MO_VS_Rem {}) _) = True | ||

3377 | isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True | 3787 | isVecExpr (CmmMachOp (MO_VS_Neg {}) _) = True | ||

3378 | isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True | 3788 | isVecExpr (CmmMachOp (MO_VF_Broadcast {}) _) = True | ||

3379 | isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True | 3789 | isVecExpr (CmmMachOp (MO_VF_Insert {}) _) = True | ||

3380 | isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True | 3790 | isVecExpr (CmmMachOp (MO_VF_Extract {}) _) = True | ||

3381 | isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True | 3791 | isVecExpr (CmmMachOp (MO_VF_Add {}) _) = True | ||

3382 | isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True | 3792 | isVecExpr (CmmMachOp (MO_VF_Sub {}) _) = True | ||

3383 | isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True | 3793 | isVecExpr (CmmMachOp (MO_VF_Mul {}) _) = True | ||

3384 | isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True | 3794 | isVecExpr (CmmMachOp (MO_VF_Quot {}) _) = True | ||

3385 | isVecExpr (CmmMachOp _ [e]) = isVecExpr e | 3795 | isVecExpr (CmmMachOp (MO_VF_Neg {}) _) = True | ||

3386 | isVecExpr _ = False | 3796 | isVecExpr (CmmMachOp _ [e]) = isVecExpr e | ||

3797 | isVecExpr _ = False | ||||

3387 | 3798 | | |||

3388 | needLlvm :: NatM a | 3799 | needLlvm :: NatM a | ||

3389 | needLlvm = | 3800 | needLlvm = | ||

3390 | sorry $ unlines ["The native code generator does not support vector" | 3801 | sorry $ unlines ["The native code generator does not support vector" | ||

3391 | ,"instructions. Please use -fllvm."] | 3802 | ,"instructions. Please use -fllvm."] | ||

3803 | | ||||

3804 | incorrectOperands :: NatM a | ||||

3805 | incorrectOperands = sorry "Incorrect number of operands" |

carter: Are these for Floats/Doubles or Int32/Word32/Int64/Word64?