; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with sideeffects we can force a partial register spill of the
; relevant registers and check that the reload is correctly folded into the instruction.

define <32 x half> @stack_fold_fmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  ret <32 x half> %2
}
declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)

define <32 x half> @stack_fold_fmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a2)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a0)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %a0)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %a1)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %a1)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fnmadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a2)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a1, <32 x half> %a0)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a2, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg, <32 x half> %a0, <32 x half> %a1)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = fneg <32 x half> %a2
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = fneg <32 x half> %a2
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = fneg <32 x half> %a0
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = fneg <32 x half> %a0
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a1, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = fneg <32 x half> %a1
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a2, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fnmsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = fneg <32 x half> %a1
  %4 = call <32 x half> @llvm.fma.v32f16(<32 x half> %2, <32 x half> %a0, <32 x half> %3)
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fnmsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fnmsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a1, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %neg1 = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a2, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fnmsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfnmsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %neg1 = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.fma.v32f16(<32 x half> %neg1, <32 x half> %a0, <32 x half> %neg)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define half @stack_fold_fmadd123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  ret half %2
}
declare half @llvm.fma.f16(half, half, half)

define half @stack_fold_fmadd213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  ret half %2
}

define half @stack_fold_fmadd231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  ret half %2
}

define half @stack_fold_fmadd321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  ret half %2
}

define half @stack_fold_fmadd132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  ret half %2
}

define half @stack_fold_fmadd312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmadd312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
  ret half %2
}

define half @stack_fold_fmsub123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %a0, half %a1, half %2)
  ret half %3
}

define half @stack_fold_fmsub213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %a1, half %a0, half %2)
  ret half %3
}

define half @stack_fold_fmsub231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %a1, half %a2, half %2)
  ret half %3
}

define half @stack_fold_fmsub321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %a2, half %a1, half %2)
  ret half %3
}

define half @stack_fold_fmsub132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %a0, half %a2, half %2)
  ret half %3
}

define half @stack_fold_fmsub312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fmsub312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %a2, half %a0, half %2)
  ret half %3
}

define half @stack_fold_fnmadd123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %2, half %a1, half %a2)
  ret half %3
}

define half @stack_fold_fnmadd213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %2, half %a0, half %a2)
  ret half %3
}

define half @stack_fold_fnmadd231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = call half @llvm.fma.f16(half %2, half %a2, half %a0)
  ret half %3
}

define half @stack_fold_fnmadd321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %2, half %a1, half %a0)
  ret half %3
}

define half @stack_fold_fnmadd132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = call half @llvm.fma.f16(half %2, half %a2, half %a1)
  ret half %3
}

define half @stack_fold_fnmadd312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmadd312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = call half @llvm.fma.f16(half %2, half %a0, half %a1)
  ret half %3
}

define half @stack_fold_fnmsub123sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub123sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = fneg half %a2
  %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
  ret half %4
}

define half @stack_fold_fnmsub213sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub213sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = fneg half %a2
  %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
  ret half %4
}

define half @stack_fold_fnmsub231sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub231sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a1
  %3 = fneg half %a0
  %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
  ret half %4
}

define half @stack_fold_fnmsub321sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub321sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = fneg half %a0
  %4 = call half @llvm.fma.f16(half %2, half %a1, half %3)
  ret half %4
}

define half @stack_fold_fnmsub132sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub132sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a0
  %3 = fneg half %a1
  %4 = call half @llvm.fma.f16(half %2, half %a2, half %3)
  ret half %4
}

define half @stack_fold_fnmsub312sh(half %a0, half %a1, half %a2) {
; CHECK-LABEL: stack_fold_fnmsub312sh:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsh %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg half %a2
  %3 = fneg half %a1
  %4 = call half @llvm.fma.f16(half %2, half %a0, half %3)
  ret half %4
}

define <8 x half> @stack_fold_fmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmadd312sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fmsub312sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmadd312sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub123sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub123sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub213sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub213sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub231sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub231sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub321sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub321sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub132sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub132sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub312sh_int(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v) {
; CHECK-LABEL: stack_fold_fnmsub312sh_int:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %res = insertelement <8 x half> %a0v, half %2, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub123sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub213sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub231sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub321sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub132sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub312sh_intk(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312sh_intk:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half %a0
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd123sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd213sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd231sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd321sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd132sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmadd312sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub123sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a0, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub213sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %2 = call half @llvm.fma.f16(half %a1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub231sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub321sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %2 = call half @llvm.fma.f16(half %a2, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub132sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a0, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsub312sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %2 = call half @llvm.fma.f16(half %a2, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd123sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd213sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a2)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd231sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd321sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %a0)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd132sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmadd312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmadd312sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmadd132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %a1)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub123sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub123sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub213sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub213sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub213sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a2
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub231sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub231sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a1
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub321sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub321sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub231sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a0
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a1, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub132sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub132sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a0
  %2 = call half @llvm.fma.f16(half %neg1, half %a2, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <8 x half> @stack_fold_fnmsub312sh_intkz(<8 x half> %a0v, <8 x half> %a1v, <8 x half> %a2v, ptr %mask) {
; CHECK-LABEL: stack_fold_fnmsub312sh_intkz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movzbl (%rdi), %eax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vfnmsub132sh {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 {%k1} {z} # 16-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = extractelement <8 x half> %a0v, i64 0
  %a1 = extractelement <8 x half> %a1v, i64 0
  %a2 = extractelement <8 x half> %a2v, i64 0
  %neg = fneg half %a1
  %neg1 = fneg half %a2
  %2 = call half @llvm.fma.f16(half %neg1, half %a0, half %neg)
  %3 = load i8, ptr %mask
  %4 = bitcast i8 %3 to <8 x i1>
  %5 = extractelement <8 x i1> %4, i64 0
  %6 = select i1 %5, half %2, half zeroinitializer
  %res = insertelement <8 x half> %a0v, half %6, i64 0
  ret <8 x half> %res
}

define <32 x half> @stack_fold_fmaddsub123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
  ret <32 x half> %2
}
declare <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half>, <32 x half>, <32 x half>, i32)

define <32 x half> @stack_fold_fmaddsub213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmaddsub312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  ret <32 x half> %2
}

define <32 x half> @stack_fold_fmaddsub123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmaddsub312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmaddsub123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %a2, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %a0, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %a0, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %a1, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmaddsub312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmaddsub312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmaddsub132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %a1, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd123ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd123ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd213ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd213ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a2
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd231ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd231ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd321ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd321ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a0
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd132ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd132ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd312ph(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2) {
; CHECK-LABEL: stack_fold_fmsubadd312ph:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = fneg <32 x half> %a1
  %3 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %2, i32 4)
  ret <32 x half> %3
}

define <32 x half> @stack_fold_fmsubadd123ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd123ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd213ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd213ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd231ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd231ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd321ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd321ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd132ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd132ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd312ph_mask(ptr %p, <32 x half> %a1, <32 x half> %a2, i32 %mask) {
; CHECK-LABEL: stack_fold_fmsubadd312ph_mask:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    vmovaps (%rdi), %zmm2
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 64-byte Folded Reload
; CHECK-NEXT:    vmovaps %zmm2, %zmm0
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %a0 = load <32 x half>, ptr %p
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = bitcast i32 %mask to <32 x i1>
  %4 = select <32 x i1> %3, <32 x half> %2, <32 x half> %a0
  ret <32 x half> %4
}

define <32 x half> @stack_fold_fmsubadd123ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd123ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd213ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd213ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd213ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a2
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd231ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd231ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a1, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd321ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd321ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd231ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a0
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a1, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd132ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd132ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a0, <32 x half> %a2, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}

define <32 x half> @stack_fold_fmsubadd312ph_maskz(<32 x half> %a0, <32 x half> %a1, <32 x half> %a2, ptr %mask) {
; CHECK-LABEL: stack_fold_fmsubadd312ph_maskz:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    kmovd (%rdi), %k1
; CHECK-NEXT:    vfmsubadd132ph {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 {%k1} {z} # 64-byte Folded Reload
; CHECK-NEXT:    retq
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %neg = fneg <32 x half> %a1
  %2 = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %a2, <32 x half> %a0, <32 x half> %neg, i32 4)
  %3 = load i32, ptr %mask
  %4 = bitcast i32 %3 to <32 x i1>
  %5 = select <32 x i1> %4, <32 x half> %2, <32 x half> zeroinitializer
  ret <32 x half> %5
}
