ComputeBasePtrOfStridedBatch Struct Reference
ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, ADataType, BDataType, B1DataType, CDataType, D0sDataType, D1sDataType, GemmAccDataType, CShuffleDataType, AElementwiseOperation, BElementwiseOperation, C0DEElementwiseOperation, B1ElementwiseOperation, C1DEElementwiseOperation, GemmSpec, ASpec, BSpec, B1Spec, CSpec, NumGemmKPrefetchStage, BlockSize, MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock, Gemm1KPerBlock, AK1, BK1, B1K1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave, Gemm1NXdlPerWave, ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, BBlockLdsExtraN, B1BlockTransferThreadClusterLengths_BK0_N_BK1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_BK1, B1BlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpec, D0sTransferSrcScalarPerVector, LoopSched >::ComputeBasePtrOfStridedBatch Struct Reference
#include <device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp>
Public Member Functions | |
| ComputeBasePtrOfStridedBatch (const AGridDesc_G_M_K &a_grid_desc_g_m_k, const BGridDesc_G_N_K &b_grid_desc_g_n_k, const B1GridDesc_G_N_K &b1_grid_desc_g_n_k, const C1GridDesc_G_M_N &c1_grid_desc_g_m_n, const D0sGridDesc_G_M_N &d0s_grid_desc_g_m_n) | |
| __host__ __device__ constexpr long_index_t | GetABasePtr (index_t g_idx) const |
| __host__ __device__ constexpr long_index_t | GetBBasePtr (index_t g_idx) const |
| __host__ __device__ constexpr long_index_t | GetB1BasePtr (index_t g_idx) const |
| __host__ __device__ constexpr long_index_t | GetCBasePtr (index_t g_idx) const |
| template<index_t I> | |
| __host__ __device__ constexpr long_index_t | GetD0BasePtr (index_t g_idx, Number< I > d0_idx) const |
Constructor & Destructor Documentation
◆ ComputeBasePtrOfStridedBatch()
template<index_t NumDimG, index_t NumDimM, index_t NumDimN, index_t NumDimK, index_t NumDimO, typename ADataType, typename BDataType, typename B1DataType, typename CDataType, typename D0sDataType, typename D1sDataType, typename GemmAccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename C0DEElementwiseOperation, typename B1ElementwiseOperation, typename C1DEElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization BSpec, TensorSpecialization B1Spec, TensorSpecialization CSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t AK1, index_t BK1, index_t B1K1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, index_t Gemm1NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization 
MaskingSpec, int D0sTransferSrcScalarPerVector = 4, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline |
Member Function Documentation
◆ GetABasePtr()
template<index_t NumDimG, index_t NumDimM, index_t NumDimN, index_t NumDimK, index_t NumDimO, typename ADataType, typename BDataType, typename B1DataType, typename CDataType, typename D0sDataType, typename D1sDataType, typename GemmAccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename C0DEElementwiseOperation, typename B1ElementwiseOperation, typename C1DEElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization BSpec, TensorSpecialization B1Spec, TensorSpecialization CSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t AK1, index_t BK1, index_t B1K1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, index_t Gemm1NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization 
MaskingSpec, int D0sTransferSrcScalarPerVector = 4, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline constexpr |
◆ GetB1BasePtr()
template<index_t NumDimG, index_t NumDimM, index_t NumDimN, index_t NumDimK, index_t NumDimO, typename ADataType, typename BDataType, typename B1DataType, typename CDataType, typename D0sDataType, typename D1sDataType, typename GemmAccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename C0DEElementwiseOperation, typename B1ElementwiseOperation, typename C1DEElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization BSpec, TensorSpecialization B1Spec, TensorSpecialization CSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t AK1, index_t BK1, index_t B1K1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, index_t Gemm1NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization 
MaskingSpec, int D0sTransferSrcScalarPerVector = 4, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline constexpr |
◆ GetBBasePtr()
template<index_t NumDimG, index_t NumDimM, index_t NumDimN, index_t NumDimK, index_t NumDimO, typename ADataType, typename BDataType, typename B1DataType, typename CDataType, typename D0sDataType, typename D1sDataType, typename GemmAccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename C0DEElementwiseOperation, typename B1ElementwiseOperation, typename C1DEElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization BSpec, TensorSpecialization B1Spec, TensorSpecialization CSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t AK1, index_t BK1, index_t B1K1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, index_t Gemm1NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization 
MaskingSpec, int D0sTransferSrcScalarPerVector = 4, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline constexpr |
◆ GetCBasePtr()
template<index_t NumDimG, index_t NumDimM, index_t NumDimN, index_t NumDimK, index_t NumDimO, typename ADataType, typename BDataType, typename B1DataType, typename CDataType, typename D0sDataType, typename D1sDataType, typename GemmAccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename C0DEElementwiseOperation, typename B1ElementwiseOperation, typename C1DEElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization BSpec, TensorSpecialization B1Spec, TensorSpecialization CSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t AK1, index_t BK1, index_t B1K1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, index_t Gemm1NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization 
MaskingSpec, int D0sTransferSrcScalarPerVector = 4, LoopScheduler LoopSched = LoopScheduler::Default>
|
inline constexpr |
◆ GetD0BasePtr()
template<index_t NumDimG, index_t NumDimM, index_t NumDimN, index_t NumDimK, index_t NumDimO, typename ADataType, typename BDataType, typename B1DataType, typename CDataType, typename D0sDataType, typename D1sDataType, typename GemmAccDataType, typename CShuffleDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename C0DEElementwiseOperation, typename B1ElementwiseOperation, typename C1DEElementwiseOperation, GemmSpecialization GemmSpec, TensorSpecialization ASpec, TensorSpecialization BSpec, TensorSpecialization B1Spec, TensorSpecialization CSpec, index_t NumGemmKPrefetchStage, index_t BlockSize, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t Gemm1NPerBlock, index_t Gemm1KPerBlock, index_t AK1, index_t BK1, index_t B1K1, index_t MPerXDL, index_t NPerXDL, index_t MXdlPerWave, index_t NXdlPerWave, index_t Gemm1NXdlPerWave, typename ABlockTransferThreadClusterLengths_AK0_M_AK1, typename ABlockTransferThreadClusterArrangeOrder, typename ABlockTransferSrcAccessOrder, index_t ABlockTransferSrcVectorDim, index_t ABlockTransferSrcScalarPerVector, index_t ABlockTransferDstScalarPerVector_AK1, bool ABlockLdsExtraM, typename BBlockTransferThreadClusterLengths_BK0_N_BK1, typename BBlockTransferThreadClusterArrangeOrder, typename BBlockTransferSrcAccessOrder, index_t BBlockTransferSrcVectorDim, index_t BBlockTransferSrcScalarPerVector, index_t BBlockTransferDstScalarPerVector_BK1, bool BBlockLdsExtraN, typename B1BlockTransferThreadClusterLengths_BK0_N_BK1, typename B1BlockTransferThreadClusterArrangeOrder, typename B1BlockTransferSrcAccessOrder, index_t B1BlockTransferSrcVectorDim, index_t B1BlockTransferSrcScalarPerVector, index_t B1BlockTransferDstScalarPerVector_BK1, bool B1BlockLdsExtraN, index_t CShuffleMXdlPerWavePerShuffle, index_t CShuffleNXdlPerWavePerShuffle, typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, index_t CShuffleBlockTransferScalarPerVector_NPerBlock, MaskingSpecialization 
MaskingSpec, int D0sTransferSrcScalarPerVector = 4, LoopScheduler LoopSched = LoopScheduler::Default>
template<index_t I>
|
inline constexpr |
The documentation for this struct was generated from the following file: