DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched > Struct Template Reference
Classes |
Public Types |
Public Member Functions |
Static Public Member Functions |
Static Public Attributes |
List of all members
ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched > Struct Template Reference
#include <device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp>
Inheritance diagram for ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >:
Classes | |
| struct | Argument |
| struct | Invoker |
Public Types | |
| using | DeviceOp = DeviceGemmMultipleDMultipleR_Xdl_CShuffle |
| using | AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)) |
| using | BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)) |
| using | EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)) |
| using | RGridDesc_M = decltype(MakeRGridDescriptor_M(1)) |
| template<index_t NXdlPerWave_> | |
| using | GridwiseGemmBase |
| using | GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)> |
| using | GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32> |
| using | AGridDesc_AK0_M_AK1 |
| using | BGridDesc_BK0_N_BK1 |
| using | Block2ETileMap = typename GridwiseGemm64::DefaultBlock2ETileMap |
Public Member Functions | |
| bool | IsSupportedArgument (const BaseArgument *p_arg) override |
| std::unique_ptr< BaseArgument > | MakeArgumentPointer (const void *p_a, const void *p_b, std::array< const void *, NumDTensor > p_ds, void *p_e, std::array< void *, NumRTensor > p_rs, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideE, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, QsElementwiseOperation qs_element_op, RsElementwiseOperation rs_element_op) override |
| std::unique_ptr< BaseInvoker > | MakeInvokerPointer () override |
| std::string | GetTypeString () const override |
| Public Member Functions inherited from ck::tensor_operation::device::BaseOperator | |
| BaseOperator ()=default | |
| BaseOperator (const BaseOperator &)=default | |
| BaseOperator & | operator= (const BaseOperator &)=default |
| virtual std::string | GetInstanceString () const |
| virtual std::string | GetTypeIdName () const |
| virtual std::optional< std::string > | GetObjectName () const |
| virtual std::optional< std::string > | GetTemplateInfo () const |
| virtual std::string | GetTypeIdHashCode () const |
| virtual size_t | GetWorkSpaceSize (const BaseArgument *) const |
| virtual void | SetWorkSpacePointer (BaseArgument *p_arg, void *p_workspace, const StreamConfig &=StreamConfig{}) const |
| virtual | ~BaseOperator () |
Static Public Member Functions | |
| static auto | MakeAGridDescriptor_M_K (index_t MRaw, index_t KRaw, index_t StrideA) |
| static auto | MakeBGridDescriptor_N_K (index_t KRaw, index_t NRaw, index_t StrideB) |
| static auto | MakeEGridDescriptor_M_N (index_t MRaw, index_t NRaw, index_t StrideE) |
| static auto | MakeRGridDescriptor_M (index_t MRaw) |
| static bool | IsSupportedArgument (const Argument &arg) |
| static auto | MakeArgument (const void *p_a, const void *p_b, std::array< const void *, NumDTensor > p_ds, void *p_e, std::array< void *, NumRTensor > p_rs, index_t MRaw, index_t NRaw, index_t KRaw, index_t StrideA, index_t StrideB, std::array< index_t, NumDTensor > StrideDs, index_t StrideE, AElementwiseOperation a_element_op, BElementwiseOperation b_element_op, CDEElementwiseOperation cde_element_op, QsElementwiseOperation qs_element_op, RsElementwiseOperation rs_element_op) |
| static auto | MakeInvoker () |
Static Public Attributes | |
| static GET_NXDL_PER_WAVE_IMPL constexpr auto | NXdlPerWave64 = GetNXdlPerWave<true>() |
| static constexpr auto | NXdlPerWave32 = GetNXdlPerWave<false>() |
| static constexpr index_t | NumDTensor = DsDataType::Size() |
| static constexpr index_t | NumRTensor = RsDataType::Size() |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | matrix_padder |
| Static Public Attributes inherited from ck::tensor_operation::device::DeviceGemmMultipleDMultipleR< ALayout, BLayout, DELayout, ADataType, BDataType, DsDataType, EDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation > | |
| static constexpr index_t | NumDTensor = DsDataType::Size() |
| static constexpr index_t | NumRTensor = RsDataType::Size() |
Member Typedef Documentation
◆ AGridDesc_AK0_M_AK1
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::AGridDesc_AK0_M_AK1 |
Initial value:
= remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{}))>
decltype(MakeAGridDescriptor_M_K(1, 1, 1)) AGridDesc_M_K
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:346
__host__ static __device__ constexpr auto MakeDefaultAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K &a_grid_desc_m_k)
Definition gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp:174
◆ AGridDesc_M_K
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)) |
◆ BGridDesc_BK0_N_BK1
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::BGridDesc_BK0_N_BK1 |
Initial value:
= remove_cvref_t<decltype(GridwiseGemm64::MakeDefaultBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{}))>
decltype(MakeBGridDescriptor_N_K(1, 1, 1)) BGridDesc_N_K
Definition gridwise_gemm_multiple_d_xdl_cshuffle_lds_direct_load.hpp:347
__host__ static __device__ constexpr auto MakeDefaultBGridDescriptor_BK0_N_BK1(const BGridDesc_N_K &b_grid_desc_n_k)
Definition gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp:190
◆ BGridDesc_N_K
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)) |
◆ Block2ETileMap
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::Block2ETileMap = typename GridwiseGemm64::DefaultBlock2ETileMap |
◆ DeviceOp
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::DeviceOp = DeviceGemmMultipleDMultipleR_Xdl_CShuffle |
◆ EGridDesc_M_N
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)) |
◆ GridwiseGemm32
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::GridwiseGemm32 = GridwiseGemmBase<NXdlPerWave32> |
◆ GridwiseGemm64
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::GridwiseGemm64 = GridwiseGemmBase<math::max(NXdlPerWave64, 1)> |
◆ GridwiseGemmBase
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
template<index_t NXdlPerWave_>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::GridwiseGemmBase |
◆ RGridDesc_M
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
| using ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle< ALayout, BLayout, DELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, QsElementwiseOperation, RsElementwiseOperation, ThreadReduceOperations, RsGlobalMemoryDataOperation, GemmSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, CDEReduceThreadTransferScalarPerVector_NPerBlock, RThreadTransferDstScalarPerVector_MPerBlock, LoopSched >::RGridDesc_M = decltype(MakeRGridDescriptor_M(1)) |
Member Function Documentation
◆ GetTypeString()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline override virtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ IsSupportedArgument() [1/2]
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline static |
◆ IsSupportedArgument() [2/2]
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline override virtual |
Reimplemented from ck::tensor_operation::device::BaseOperator.
◆ MakeAGridDescriptor_M_K()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline static |
◆ MakeArgument()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline static |
◆ MakeArgumentPointer()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline override virtual |
◆ MakeBGridDescriptor_N_K()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline static |
◆ MakeEGridDescriptor_M_N()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline static |
◆ MakeInvoker()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline static |
◆ MakeInvokerPointer()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline override virtual |
◆ MakeRGridDescriptor_M()
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
inline static |
Member Data Documentation
◆ I0
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
◆ I1
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
◆ I2
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
◆ I3
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
◆ matrix_padder
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
Initial value:
= MatrixPadder<GemmSpec, index_t, index_t, index_t>{MPerBlock, NPerBlock, KPerBlock}
Definition matrix_padder.hpp:180
◆ NumDTensor
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
◆ NumRTensor
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
◆ NXdlPerWave32
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
◆ NXdlPerWave64
template<typename ALayout, typename BLayout, typename DELayout, typename ADataType, typename BDataType, typename GemmAccDataType, typename CShuffleDataType, typename DsDataType, typename EDataType, typename ReduceAccDataType, typename RsDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename ThreadReduceOperations, typename RsGlobalMemoryDataOperation, GemmSpecialization GemmSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t AK1, index_t BK1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, index_t CDEReduceThreadTransferScalarPerVector_NPerBlock, index_t RThreadTransferDstScalarPerVector_MPerBlock, LoopScheduler LoopSched = make_default_loop_scheduler()>
|
static constexpr |
The documentation for this struct was generated from the following file: