ck Namespace Reference

ck Namespace Reference

Composable Kernel: ck Namespace Reference
ck Namespace Reference

Namespaces

namespace  utility
namespace  ranges
namespace  utils
namespace  host_common
namespace  literals
namespace  conv_tensor_rearrange_op
namespace  tensor_operation
namespace  tensor_layout
namespace  detail
namespace  fp8_impl
namespace  dpp8
namespace  lds_utils
namespace  debug
namespace  internal
namespace  util
namespace  math
namespace  reduce
namespace  impl
namespace  mathext
namespace  details

Classes

struct  InMemoryDataOperationEnumSequence
struct  StaticTensor
struct  StaticTensorTupleOfVectorBuffer
struct  PassThrough
struct  Pad
struct  LeftPad
struct  RightPad
struct  Embed
struct  Merge_v1_carry_check
struct  lambda_merge_generate_MagicDivision_calculate_magic_multiplier
struct  lambda_merge_generate_MagicDivision_calculate_magic_shift
struct  Merge_v2_magic_division
struct  Merge_v2r2_magic_division
struct  Merge_v3_division_mod
struct  UnMerge
struct  ConvBwdDataImplicitGemmOutTransform
 Transformation struct for convolution backward data output indices to GEMM indices. More...
struct  Freeze
struct  Insert
struct  Vectorize
struct  Slice
struct  Modulo
struct  Xor
struct  TensorAdaptor
struct  TensorCoordinate
struct  TensorCoordinateStep
struct  TensorDescriptor
struct  lambda_get_up_dim_num
struct  SpaceFillingCurve
struct  BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2
struct  BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
struct  BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
struct  BlockwiseGemmDpp_ak0mak1_bk0nbk1_m0n0m1n1m2n2
struct  BlockwiseGemmXdlops_mx_pipeline_base
struct  BlockwiseGemmWmmaops_pipeline_hotloop_inst
struct  BlockwiseGemmWmmaops_pipeline_base
struct  BlockwiseGemmWmmaops_pipeline_v1
struct  BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >
struct  BlockwiseGemmWmmaops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >
struct  BlockwiseGemmWmmaops_pipeline_v3
struct  BlockwiseGemmWmmaops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeTypeA, ComputeTypeB, AccDataType, AWmmaTileDesc, BWmmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, MRepeat, NRepeat, KPack, TransposeC >
struct  BlockwiseGemmXdlops_pipeline_hotloop_inst
struct  BlockwiseGemmXdlops_pipeline_v4
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_bdequant_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v1
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v1< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v1
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v2
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v2< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v3
struct  BlockwiseGemmXdlops_pipeline_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_base
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3
struct  BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3
struct  BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MScaleBlock, NScaleBlock, KScaleBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_mx_moe_bns_gufusion_v3
struct  BlockwiseGemmXdlops_pipeline_mx_moe_bns_gufusion_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v1
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v1< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3
struct  BlockwiseGemmXdlops_pipeline_mx_moe_nbs_v3< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v1
struct  BlockwiseGemmXdlops_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v1< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlopsDirectLoad_pipeline_v1
struct  BlockwiseGemmXdlopsDirectLoad_pipeline_v1< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v1_ab_scale
struct  BlockwiseGemmXdlops_pipeline_v1_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v1_b_scale
struct  BlockwiseGemmXdlops_pipeline_v1_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v1_mx
struct  BlockwiseGemmXdlops_pipeline_v1_mx< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v2
struct  BlockwiseGemmXdlops_pipeline_v2< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v2< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v2_ab_scale
struct  BlockwiseGemmXdlops_pipeline_v2_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v2_b_scale< BlockGemmPipelineScheduler::Interwave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v3
struct  BlockwiseGemmXdlops_pipeline_v3< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v3_ab_scale
struct  BlockwiseGemmXdlops_pipeline_v3_ab_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v3_b_scale
struct  BlockwiseGemmXdlops_pipeline_v3_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v3_mx
struct  BlockwiseGemmXdlops_pipeline_v3_mx< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle
struct  BlockwiseGemmXdlops_pipeline_v3_mx_bprehuffle< BlockGemmPipelineScheduler::Intrawave, ThreadBlockSize, ScaleBlockSize, ADataType, AScaleDataType, BDataType, BScaleDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlopsDirectLoad_pipeline_v4
struct  BlockwiseGemmXdlopsDirectLoad_pipeline_v4< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v4_b_scale
struct  BlockwiseGemmXdlops_pipeline_v4_b_scale< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_pipeline_v5
struct  BlockwiseGemmXdlops_pipeline_v5< BlockGemmPipelineScheduler::Intrawave, BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack >
struct  BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
struct  BlockwiseGemmWMMA
struct  BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
struct  BlockwiseGemmXdlops_v2
 Blockwise gemm. More...
struct  BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1
struct  BlockwiseSoftmax
 Blockwise softmax. More...
struct  BlockwiseTensorSliceTransfer_v5r1
struct  BlockwiseWelford
struct  PartitionedBlockwiseReduction
struct  PartitionedBlockwiseReduction_v2
struct  PartitionedBlockwiseReductionWithIndex
struct  ThreadGroupTensorSliceTransfer_DirectLoad
struct  ThreadGroupTensorSliceTransfer_Gather_DirectLoad
struct  ThreadGroupTransferGlobal
struct  ThreadGroupTensorSliceTransfer_v4r1
 Blockwise data transfer. More...
struct  ThreadGroupTensorSliceTransfer_v4r1_dequant
 Blockwise data transfer with dequantization. More...
struct  ThreadGroupTensorSliceTransfer_v4r1_gather
 Blockwise data transfer. More...
struct  ThreadGroupTensorSliceTransfer_v4r2
 Blockwise data transfer. More...
struct  ThreadGroupTensorSliceTransfer_v6r1
struct  ThreadGroupTensorSliceTransfer_v6r1r2
struct  ThreadGroupTensorSliceTransfer_v6r2
struct  ThreadGroupTensorSliceTransfer_v6r3
struct  ThreadGroupTensorSliceTransfer_v7
struct  ThreadGroupTensorSliceTransfer_v7r2
struct  ThreadGroupTensorSliceTransfer_v7r3
struct  ThreadGroupTensorSliceTransfer_v7r3_scatter
struct  reduce_binary_operator
struct  reduce_binary_operator< ReduceTensorOp::ADD >
struct  reduce_binary_operator< ReduceTensorOp::MUL >
struct  reduce_binary_operator< ReduceTensorOp::MIN >
struct  reduce_binary_operator< ReduceTensorOp::MAX >
struct  reduce_binary_operator< ReduceTensorOp::AMAX >
struct  reduce_binary_operator< ReduceTensorOp::AVG >
struct  reduce_binary_operator< ReduceTensorOp::NORM1 >
struct  reduce_binary_operator< ReduceTensorOp::NORM2 >
struct  reduce_unary_operator
struct  reduce_unary_operator< ReduceTensorOp::AVG, IsFirstReduce, true >
struct  reduce_unary_operator< ReduceTensorOp::NORM1, true, IsLastReduce >
struct  reduce_unary_operator< ReduceTensorOp::AMAX, true, IsLastReduce >
struct  reduce_unary_operator< ReduceTensorOp::NORM2, true, false >
struct  reduce_unary_operator< ReduceTensorOp::NORM2, true, true >
struct  reduce_unary_operator< ReduceTensorOp::NORM2, false, true >
struct  GridwiseMultiblockBatchNormForward
struct  GridwiseReduceSecondHalfBatchNormBackwardFinal
struct  GridwiseMultiblockWelfordFirstHalf
struct  GridwiseWelfordSecondHalfBatchNormForwardFinal
struct  GridwiseWelfordSecondHalfReduceFirstHalf
struct  BlockToCTileMap_M00_N0_M01
struct  BlockToCTileMap_M00_N0_M01Adapt
struct  BlockToCTileMap_M00_N0_M01Adapt< MPerBlock, NPerBlock, void >
struct  BlockToCTileMap_Grouped_M00_N0_M01Adapt
struct  BlockToCTileMap_N00_M0_N01Adapt
struct  BlockToCTileMap_N00_M0_N01Adapt< MPerBlock, NPerBlock, void >
struct  BlockToCTileMap_KSplit_M00_N0_M01Adapt
struct  BlockToCTileMap_M00_N00_M01_N01
struct  BlockToCTileMap_KSplit_M00_N00_M01_N01
struct  OffsettedBlockToCTileMap
struct  OffsettedBlockToCTileMap2
struct  BlockToCTileMap_3DGrid_KSplit
 Simple tile mapping which creates a 3D grid of thread blocks. More...
struct  BlockToCTileMap_GemmStreamK
struct  BlockToCTileMap_GemmStreamK_v2
struct  EpilogueWelfordCShuffle
struct  EpilogueCShuffle
struct  EpilogueCShuffleBase
struct  GridwiseGemmMultipleDWelfordFirstHalf_xdl_cshuffle
struct  GridwiseWelfordSecondHalfLayernorm2d
struct  GridwiseMultipleReduction_mk_to_m_multiblock
struct  GridwiseMultipleReduction_mk_to_m_threadwise
struct  GridwiseReduction_mk_to_m_multiblock
struct  GridwiseReduction_mk_to_m_threadwise
struct  GridwiseReduction_mk_to_m_threadwise_multi_d
struct  ABTransferThreadTiles
struct  ABTransferWaveTiles
struct  GridwiseBatchedGemmGemm_wmma_cshuffle_v3
struct  GridwiseBatchedGemmGemm_Xdl_CShuffle
struct  GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
struct  GridwiseBatchedGemmMultipleDSoftmaxGemm_Xdl_CShuffle
struct  GridwiseBatchedGemmSoftmaxGemm_Wmma
struct  GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle
 Gridwise gemm + softmax + gemm fusion. More...
struct  GridwiseBatchNormBackwardWithBlockwiseWelford
struct  GridwiseBatchNormForwardWithBlockwiseWelford
struct  GridwiseElementwise_1D
struct  GridwiseElementwise
struct  GridwiseElementwiseLayernormWelfordVariance_mk_to_mk
struct  GridwiseFpAintBGemm_Wmma
struct  GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
struct  GridwiseGemmDlMultipleD_km_kn_mn
struct  GridwiseGemmDl_km_kn_mn_v1r3
struct  GridwiseGemmDl_bkm_bkn_mn_v1r3
struct  GridwiseGemm_ak0mak1_bk0nbk1_mn_dpp
struct  GridwiseGemmMultipleABD_xdl_cshuffle
struct  GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1
struct  GridwiseGemmMultipleD_Wmma
struct  GridwiseGemmMultipleD_xdl_cshuffle
struct  GridwiseGemmMultipleD_Xdl_CShuffle_LdsDirectLoad
struct  GridwiseGemmMultipleD_xdl_splitk_cshuffle
struct  GridwiseGemmPipeline_v1
struct  GridwiseGemmPipeline_v1< 1, true, true >
struct  GridwiseGemmPipeline_v1< 2, true, true >
struct  GridwiseGemmPipeline_v1< 1, false, true >
struct  GridwiseGemmPipeline_v1< 1, true, false >
struct  GridwiseGemmPipeline_v1< 1, false, false >
struct  GridwiseGemmPipeline_v1_WeightOnly
struct  GridwiseGemmPipeline_v1_WeightOnly< 1, true, true >
struct  GridwiseGemmPipelineInterwave_v1
struct  GridwiseGemmPipelineInterwave_v1< 1 >
struct  GridwiseGemmPipelineInterwave_v1< 2 >
struct  GridwiseGemmPipeline_v2
struct  GridwiseGemmPipeline_v3
struct  GridwiseGemmPipeline_v4
struct  GridwiseGemmPipeline_v4< 1 >
struct  GridwiseGemmPipeline_v4< 2 >
struct  GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1
struct  GridwiseGemmSplitKMultipleD_xdl_cshuffle
struct  GridwiseGemmLoadWave
struct  GridwiseGemmLoadWave< TileLoadThreadGroup, 1 >
struct  GridwiseGemmMathWave
struct  GridwiseGemmMathWave< TileMathThreadGroup, 1 >
struct  GridwiseGemm_Wmma
struct  GridwiseGemm_wmma_cshuffle_v3
 "Universal" GEMM kernel with SplitK support. More...
struct  GridwiseGemm_wmma_cshuffle_v3_b_scale
struct  GridwiseGemm_wmma_cshuffle_v3_base
struct  GridwiseGemm_xdl_cshuffle_conv_v3
struct  GridwiseGemm_xdl_cshuffle_streamk_v3
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
struct  GridwiseGemm_xdl_cshuffle_v2
struct  GridwiseGemm_xdl_cshuffle_v3
 "Universal" GEMM kernel with SplitK support. More...
struct  GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
struct  GridwiseGemmMultiD_xdl_cshuffle_v3
struct  GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
struct  GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
struct  GridwiseGemmMultiD_blockscale_xdl_cshuffle_v3_b_preshuffle
struct  GridwiseGemmMX_xdl_cshuffle_v3
struct  GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
struct  GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdl_waveletmodel_cshuffle
struct  Merge_v4_no_carry
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1
struct  GridwiseGemm_xdlops_splitk_lds_direct_load
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_streamk
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3_ext
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4
struct  GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2
struct  GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
struct  GridwiseMoeGemm
struct  GridwiseMoeGemmBlockScale
struct  GridwiseMoeGemmMX
struct  GridwiseMoeGemmMXBNS
struct  GridwiseMoeGemmMX_BPreshuffle
struct  GridwisePermute
struct  GridwisePutElement_1D
struct  GridwiseSoftmax_mk_to_mk
struct  GridwiseSparseEmbeddingsForwardLayernorm
struct  GridwiseTensorRearrange
struct  GridwiseNormalizationBwdData_mk_to_mk
struct  GridwiseNormalizationBwdGammaBeta_mk_to_k
struct  GridwiseNormalizationNaiveVariance_mk_to_mk
struct  GridwiseNormalizationSplitK1st
struct  GridwiseNormalizationSplitK2nd
struct  GridwiseNormalizationWelfordVariance_mk_to_mk
struct  ThreadwiseReduction
struct  ThreadwiseReductionWithIndex
struct  ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1
struct  ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1
struct  ThreadwiseGemmDlops_km_kn_mn_v3
struct  ThreadwiseTensorSliceSet_v1
struct  ThreadwiseTensorSliceTransfer_v1r3
struct  ThreadwiseTensorSliceTransfer_v2
 Helper structure that facilitates transfer of source (grid) data to destination threads. More...
struct  ThreadwiseTensorSliceTransfer_v2_gather
struct  ThreadwiseTensorSliceTransfer_v3
struct  ThreadwiseTensorSliceTransfer_v4
struct  ThreadwiseTensorSliceTransfer_StaticToStatic
 Threadwise data transfer. More...
struct  ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
struct  ThreadwiseTensorSliceTransfer_StaticToStatic_IntraRow
struct  ThreadwiseTensorSliceTransfer_v3r1
struct  ThreadwiseTensorSliceTransfer_v3r1_dequant
struct  ThreadwiseTensorSliceTransfer_v3r1_gather
struct  ThreadwiseTensorSliceTransfer_v3r2
struct  ThreadwiseTensorSliceTransfer_v4r1
struct  ThreadwiseTensorSliceTransfer_v5r1
struct  ThreadwiseTensorSliceTransfer_v6r1
struct  ThreadwiseTensorSliceTransfer_v6r1r2
struct  ThreadwiseTensorSliceTransfer_v6r2
struct  ThreadwiseTensorSliceTransfer_v6r3
struct  ThreadwiseTensorSliceTransfer_v7
struct  ThreadwiseTensorSliceTransfer_v7r2
struct  ThreadwiseTensorSliceTransfer_v7r3
struct  ThreadwiseTensorSliceTransfer_v7r3_scatter
struct  ThreadwiseWelford
struct  ThreadwiseWelfordMerge
struct  dpp_type
struct  dpp_type< DppInstr::dpp8_f16_32x8x2 >
struct  dpp_type< DppInstr::dpp8_f16_8x32x2 >
struct  dpp_type< DppInstr::dpp8_f16_8x16x2 >
struct  dpp_type< DppInstr::dpp8_f16_16x16x2 >
struct  dpp_type< DppInstr::dpp8_f16_4x32x2 >
struct  dpp_type< DppInstr::dpp8_f16_4x16x2 >
struct  dpp_type< DppInstr::dpp8_f16_1x32x2 >
struct  dpp_type< DppInstr::dpp8_f16_2x32x2 >
struct  dpp_type< DppInstr::dpp8_f16_2x16x2 >
struct  DppSelector
struct  DppGemm
struct  smfmac_type
struct  smfmac< SmfmacInstr::smfmac_f32_16x16x32f16 >
struct  smfmac< SmfmacInstr::smfmac_f32_32x32x16f16 >
struct  smfmac< SmfmacInstr::smfmac_f32_16x16x32bf16 >
struct  smfmac< SmfmacInstr::smfmac_f32_32x32x16bf16 >
struct  SmfmacSelector
struct  SparseXdlopsGemm
struct  wmma_type
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_f16_16x16x16_f16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_bf16_16x16x16_bf16, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_i32_16x16x16_iu8, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f16_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf16_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_i32_16x16x16_iu8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f8f8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_f8bf8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf8f8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  wmma_type< WmmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12, WaveSize, typename std::enable_if_t< WaveSize==32||WaveSize==64 > >
struct  WmmaSelector
struct  WmmaGemm
struct  mfma_type
struct  mfma_type< MfmaInstr::mfma_f32_32x32x1f32 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x2f32 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x4f32 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x1f32 >
struct  mfma_type< MfmaInstr::mfma_f32_4x4x1f32 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x4f16 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x8f16 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f16 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f16 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x16f16 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x4f16 >
struct  mfma_type< MfmaInstr::mfma_f32_4x4x4f16 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf16 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x8bf16_1k >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf16 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x16bf16_1k >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x4bf16 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x8bf16 >
struct  mfma_type< MfmaInstr::mfma_i32_32x32x8i8 >
struct  mfma_type< MfmaInstr::mfma_i32_16x16x16i8 >
struct  mfma_type< MfmaInstr::mfma_i32_32x32x16i8 >
struct  mfma_type< MfmaInstr::mfma_i32_16x16x32i8 >
struct  mfma_type< MfmaInstr::mfma_i32_32x32x32i8 >
struct  mfma_type< MfmaInstr::mfma_i32_16x16x64i8 >
struct  mfma_type< MfmaInstr::mfma_f64_16x16x4f64 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f8f8 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f8f8 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf8bf8 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf8bf8 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16f8bf8 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32f8bf8 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x16bf8f8 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x32bf8f8 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x64f8f6f4 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x128f8f6f4 >
struct  mfma_type< MfmaInstr::mfma_scale_f32_32x32x64f8f6f4 >
struct  mfma_type< MfmaInstr::mfma_scale_f32_16x16x128f8f6f4 >
struct  mfma_type< MfmaInstr::mfma_f32_16x16x8xf32 >
struct  mfma_type< MfmaInstr::mfma_f32_32x32x4xf32 >
struct  mfma_type_gfx11_base
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f16 >
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf16 >
struct  mfma_type< MfmaInstr::wmma_i32_16x16x16_iu8 >
struct  mfma_type< MfmaInstr::wmma_unsupport_16x16_gfx11 >
struct  mfma_type_gfx12_base
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f16_gfx12 >
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf16_gfx12 >
struct  mfma_type< MfmaInstr::wmma_i32_16x16x16_iu8_gfx12 >
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f8f8_gfx12 >
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_f8bf8_gfx12 >
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf8f8_gfx12 >
struct  mfma_type< MfmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12 >
struct  mfma_type< MfmaInstr::wmma_unsupport_16x16_gfx12 >
class  MfmaSelector
 Selects the appropriate MFMA instruction type and configuration for given data types and tile sizes on AMD GPUs. More...
struct  XdlopsGemm
union  BufferResource
struct  f8_fnuz_t
struct  bf8_fnuz_t
struct  f8_ocp_t
struct  bf8_ocp_t
struct  intrin_smfmac_f32_16x16x32f16
struct  intrin_smfmac_f32_16x16x32f16< 16, 16 >
struct  intrin_smfmac_f32_16x16x32bf16
struct  intrin_smfmac_f32_16x16x32bf16< 16, 16 >
struct  intrin_smfmac_f32_32x32x16f16
struct  intrin_smfmac_f32_32x32x16f16< 32, 32 >
struct  intrin_smfmac_f32_32x32x16bf16
struct  intrin_smfmac_f32_32x32x16bf16< 32, 32 >
struct  intrin_wmma_f32_16x16x16_f16_w32
struct  intrin_wmma_f32_16x16x16_f16_w32< 16, 16 >
struct  intrin_wmma_f32_16x16x16_bf16_w32
struct  intrin_wmma_f32_16x16x16_bf16_w32< 16, 16 >
struct  intrin_wmma_f16_16x16x16_f16_w32
struct  intrin_wmma_f16_16x16x16_f16_w32< 16, 16, Opsel >
struct  intrin_wmma_bf16_16x16x16_bf16_w32
struct  intrin_wmma_bf16_16x16x16_bf16_w32< 16, 16, Opsel >
struct  intrin_wmma_i32_16x16x16_iu8_w32
struct  intrin_wmma_i32_16x16x16_iu8_w32< 16, 16, neg_a, neg_b, clamp >
struct  intrin_wmma_f32_16x16x16_f16_w64
struct  intrin_wmma_f32_16x16x16_f16_w64< 16, 16 >
struct  intrin_wmma_f32_16x16x16_bf16_w64
struct  intrin_wmma_f32_16x16x16_bf16_w64< 16, 16 >
struct  intrin_wmma_f16_16x16x16_f16_w64
struct  intrin_wmma_f16_16x16x16_f16_w64< 16, 16, Opsel >
struct  intrin_wmma_bf16_16x16x16_bf16_w64
struct  intrin_wmma_bf16_16x16x16_bf16_w64< 16, 16, Opsel >
struct  intrin_wmma_i32_16x16x16_iu8_w64
struct  intrin_wmma_i32_16x16x16_iu8_w64< 16, 16, neg_a, neg_b, clamp >
struct  intrin_wmma_f32_16x16x16_f16_w32_gfx12
struct  intrin_wmma_f32_16x16x16_f16_w32_gfx12< 16, 16 >
struct  intrin_wmma_f32_16x16x16_bf16_w32_gfx12
struct  intrin_wmma_f32_16x16x16_bf16_w32_gfx12< 16, 16 >
struct  intrin_wmma_i32_16x16x16_iu8_w32_gfx12
struct  intrin_wmma_i32_16x16x16_iu8_w32_gfx12< 16, 16, neg_a, neg_b, clamp >
struct  intrin_wmma_f32_16x16x16_f8f8_w32_gfx12
struct  intrin_wmma_f32_16x16x16_f8f8_w32_gfx12< 16, 16 >
struct  intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12
struct  intrin_wmma_f32_16x16x16_f8bf8_w32_gfx12< 16, 16 >
struct  intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12
struct  intrin_wmma_f32_16x16x16_bf8f8_w32_gfx12< 16, 16 >
struct  intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12
struct  intrin_wmma_f32_16x16x16_bf8bf8_w32_gfx12< 16, 16 >
struct  intrin_mfma_f32_32x32x1f32
struct  intrin_mfma_f32_32x32x1f32< 64, 64 >
struct  intrin_mfma_f32_32x32x1f32< 32, 64 >
struct  intrin_mfma_f32_32x32x2f32
struct  intrin_mfma_f32_32x32x2f32< 32, 32 >
struct  intrin_mfma_f32_16x16x4f32
struct  intrin_mfma_f32_16x16x4f32< 16, 16 >
struct  intrin_mfma_f32_16x16x1f32
struct  intrin_mfma_f32_16x16x1f32< 16, 64 >
struct  intrin_mfma_f32_4x4x1f32
struct  intrin_mfma_f32_4x4x1f32< 4, 64 >
struct  intrin_mfma_f32_4x4x1f32< 8, 64 >
struct  intrin_mfma_f32_32x32x4f16
struct  intrin_mfma_f32_32x32x4f16< 64, 64 >
struct  intrin_mfma_f32_32x32x4f16< 32, 64 >
struct  intrin_mfma_f32_32x32x16f16
struct  intrin_mfma_f32_32x32x16f16< 32, 32 >
struct  intrin_mfma_f32_16x16x32f16
struct  intrin_mfma_f32_16x16x32f16< 16, 16 >
struct  intrin_mfma_f32_32x32x8f16
struct  intrin_mfma_f32_32x32x8f16< 32, 32 >
struct  intrin_mfma_f32_16x16x16f16
struct  intrin_mfma_f32_16x16x16f16< 16, 16 >
struct  intrin_mfma_f32_16x16x4f16
struct  intrin_mfma_f32_16x16x4f16< 16, 64 >
struct  intrin_mfma_f32_4x4x4f16
struct  intrin_mfma_f32_4x4x4f16< 4, 64 >
struct  intrin_mfma_f32_4x4x4f16< 8, 64 >
struct  intrin_mfma_f32_32x32x16bf16
struct  intrin_mfma_f32_32x32x16bf16< 32, 32 >
struct  intrin_mfma_f32_16x16x32bf16
struct  intrin_mfma_f32_16x16x32bf16< 16, 16 >
struct  intrin_mfma_f32_32x32x8bf16_1k
struct  intrin_mfma_f32_32x32x8bf16_1k< 32, 32 >
struct  intrin_mfma_f32_16x16x16bf16_1k
struct  intrin_mfma_f32_16x16x16bf16_1k< 16, 16 >
struct  intrin_mfma_f32_32x32x4bf16
struct  intrin_mfma_f32_32x32x4bf16< 32, 32 >
struct  intrin_mfma_f32_16x16x8bf16
struct  intrin_mfma_f32_16x16x8bf16< 16, 16 >
struct  intrin_mfma_i32_32x32x8i8
struct  intrin_mfma_i32_32x32x8i8< 32, 32 >
struct  intrin_mfma_i32_16x16x16i8
struct  intrin_mfma_i32_16x16x16i8< 16, 16 >
struct  intrin_mfma_i32_32x32x32i8
struct  intrin_mfma_i32_32x32x32i8< 32, 32 >
struct  intrin_mfma_i32_16x16x64i8
struct  intrin_mfma_i32_16x16x64i8< 16, 16 >
struct  intrin_mfma_i32_32x32x16i8
struct  intrin_mfma_i32_32x32x16i8< 32, 32 >
struct  intrin_mfma_i32_16x16x32i8
struct  intrin_mfma_i32_16x16x32i8< 16, 16 >
struct  intrin_mfma_f64_16x16x4f64
struct  intrin_mfma_f64_16x16x4f64< 16, 16 >
struct  intrin_mfma_f32_32x32x64f8f6f4
struct  intrin_mfma_f32_32x32x64f8f6f4< 32, 32 >
 Performs a matrix fused multiply-accumulate operation on 32x32x64 submatrices for f8, f6, and f4 data types. More...
struct  intrin_mfma_scale_f32_32x32x64f8f6f4
struct  intrin_mfma_scale_f32_32x32x64f8f6f4< 32, 32, OpselA, OpselB >
struct  intrin_mfma_scale_f32_16x16x128f8f6f4
struct  intrin_mfma_scale_f32_16x16x128f8f6f4< 16, 16, OpselA, OpselB >
struct  intrin_mfma_f32_16x16x128f8f6f4
struct  intrin_mfma_f32_16x16x128f8f6f4< 16, 16 >
 Performs a matrix fused multiply-accumulate operation on 16x16x128 submatrices for f8f6f4 data types. More...
struct  intrin_mfma_f32_32x32x16f8f8
struct  intrin_mfma_f32_32x32x16f8f8< 32, 32 >
struct  intrin_mfma_f32_16x16x32f8f8
struct  intrin_mfma_f32_16x16x32f8f8< 16, 16 >
struct  intrin_mfma_f32_32x32x16bf8bf8
struct  intrin_mfma_f32_32x32x16bf8bf8< 32, 32 >
struct  intrin_mfma_f32_16x16x32bf8bf8
struct  intrin_mfma_f32_16x16x32bf8bf8< 16, 16 >
struct  intrin_mfma_f32_32x32x16f8bf8
struct  intrin_mfma_f32_32x32x16f8bf8< 32, 32 >
struct  intrin_mfma_f32_16x16x32f8bf8
struct  intrin_mfma_f32_16x16x32f8bf8< 16, 16 >
struct  intrin_mfma_f32_32x32x16bf8f8
struct  intrin_mfma_f32_32x32x16bf8f8< 32, 32 >
struct  intrin_mfma_f32_16x16x32bf8f8
struct  intrin_mfma_f32_16x16x32bf8f8< 16, 16 >
struct  intrin_mfma_f32_16x16x8xf32
struct  intrin_mfma_f32_16x16x8xf32< 16, 16 >
struct  intrin_mfma_f32_32x32x4xf32
struct  intrin_mfma_f32_32x32x4xf32< 32, 32 >
struct  Array
struct  Array< TData, 0 >
struct  ContainerElementPicker
struct  ConstantContainerElementPicker
struct  scalar_type
struct  f4x2_pk_t
struct  f6_pk_t
struct  pk_i4_t
struct  is_scalar_type
struct  scalar_type< T >
struct  scalar_type< double >
struct  scalar_type< float >
struct  scalar_type< half_t >
struct  scalar_type< bhalf_t >
struct  scalar_type< int32_t >
struct  scalar_type< int8_t >
struct  scalar_type< uint8_t >
struct  scalar_type< pk_i4_t >
struct  scalar_type< f8_fnuz_t >
struct  scalar_type< bf8_fnuz_t >
struct  scalar_type< f8_ocp_t >
struct  scalar_type< bf8_ocp_t >
struct  scalar_type< e8m0_bexp_t >
struct  scalar_type< f4x2_pk_t >
struct  scalar_type< f6x32_pk_t >
struct  scalar_type< bf6x32_pk_t >
struct  scalar_type< f6x16_pk_t >
struct  scalar_type< bf6x16_pk_t >
struct  scalar_type< bool >
struct  packed_type_info
struct  packed_type_maker
struct  vector_type
struct  vector_type_maker
struct  scalar_type< vector_type< T, N > >
struct  vector_type_maker< T, N0 >
struct  vector_type_maker< vector_type< T, N1 >, N0 >
struct  non_native_vector_base
struct  nnvb_data_t_selector
struct  nnvb_data_t_selector< f8_ocp_t >
struct  nnvb_data_t_selector< bf8_ocp_t >
struct  nnvb_data_t_selector< f8_fnuz_t >
struct  nnvb_data_t_selector< bf8_fnuz_t >
struct  nnvb_data_t_selector< e8m0_bexp_t >
struct  nnvb_data_t_selector< f6x16_pk_t >
struct  nnvb_data_t_selector< f6x32_pk_t >
struct  nnvb_data_t_selector< bf6x16_pk_t >
struct  nnvb_data_t_selector< bf6x32_pk_t >
struct  nnvb_data_t_selector< pk_i4_t >
struct  nnvb_data_t_selector< f4x2_pk_t >
struct  non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==1||sizeof(T)==2||sizeof(T)==4||sizeof(T)==8 > >
struct  non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==12||sizeof(T)==16||sizeof(T)==24||sizeof(T)==32 > >
struct  scalar_type< non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==1||sizeof(T)==2||sizeof(T)==4||sizeof(T)==8 > > >
struct  scalar_type< non_native_vector_base< T, N, ck::enable_if_t< sizeof(T)==12||sizeof(T)==16||sizeof(T)==24||sizeof(T)==32 > > >
struct  DynamicBuffer
struct  e8m0_bexp_t
 Unsigned representation of a conventional biased Float32 exponent. More...
struct  forwarder
struct  swallow
struct  logical_and
struct  logical_or
struct  logical_not
struct  static_if
struct  static_if< true >
struct  static_if< false >
struct  conditional
struct  conditional< true, X, Y >
struct  conditional< false, X, Y >
struct  static_for
struct  static_for< 0, N, 1 >
struct  static_for_range
struct  static_for_product
struct  static_for_product< Tuple< Is... > >
struct  static_for_product< Tuple< Is... >, Rest... >
struct  identity
struct  static_ford
struct  ford
struct  constant
struct  integral_constant
struct  nonesuch
struct  is_known_at_compile_time
struct  is_known_at_compile_time< index_t >
struct  is_known_at_compile_time< unsigned int >
struct  is_known_at_compile_time< long_index_t >
struct  is_known_at_compile_time< integral_constant< T, X > >
struct  is_known_at_compile_time< Sequence< Is... > >
struct  is_known_at_compile_time< Tuple< Ts... > >
struct  MagicDivision
struct  MDiv
struct  MDiv2
struct  NumericLimits
struct  NumericLimits< half_t >
struct  NumericLimits< f8_fnuz_t >
struct  NumericLimits< bf8_fnuz_t >
struct  NumericLimits< f8_ocp_t >
struct  NumericLimits< bf8_ocp_t >
struct  NumericLimits< f4_t >
struct  NumericLimits< f6_t >
struct  NumericLimits< bf6_t >
struct  NumericLimits< e8m0_bexp_t >
struct  NumericUtils
struct  NumericUtils< e8m0_bexp_t >
struct  NumericUtils< float >
struct  NumericUtils< ck::tf32_t >
struct  NumericUtils< half_t >
struct  NumericUtils< bhalf_t >
struct  NumericUtils< f8_fnuz_t >
struct  NumericUtils< bf8_fnuz_t >
struct  NumericUtils< f8_ocp_t >
struct  NumericUtils< bf8_ocp_t >
struct  NumericUtils< f4_t >
struct  NumericUtils< f6_t >
struct  NumericUtils< bf6_t >
struct  float_equal_one
struct  float_equal_zero
struct  Sequence
struct  sequence_split
struct  sequence_reverse
struct  sequence_map_inverse
struct  is_valid_sequence_map
struct  sequence_merge
struct  sequence_merge< Sequence< Xs... >, Sequence< Ys... > >
struct  sequence_merge< Seq >
struct  sequence_gen
struct  arithmetic_sequence_gen
struct  arithmetic_sequence_gen< 0, IEnd, 1 >
struct  uniform_sequence_gen
struct  sequence_reverse_inclusive_scan
struct  sequence_reverse_inclusive_scan< Sequence< I, Is... >, Reduce, Init >
struct  sequence_reverse_inclusive_scan< Sequence< I >, Reduce, Init >
struct  sequence_reverse_inclusive_scan< Sequence<>, Reduce, Init >
struct  sequence_reverse< Sequence< I > >
struct  sequence_reverse< Sequence< I0, I1 > >
struct  sequence_reduce
struct  sequence_reduce< Reduce, Sequence< Xs... >, Sequence< Ys... > >
struct  sequence_reduce< Reduce, Seq >
struct  sequence_sort_impl
struct  sequence_sort_impl< Sequence< ValueX, ValueY >, Sequence< IdX, IdY >, Compare >
struct  sequence_sort_impl< Sequence< Value >, Sequence< Id >, Compare >
struct  sequence_sort_impl< Sequence<>, Sequence<>, Compare >
struct  sequence_sort
struct  sequence_unique_sort
class  span
struct  StaticBuffer
struct  StaticBufferTupleOfVector
struct  StaticallyIndexedArray_v2
struct  ThisThreadBlock
struct  transpose_vectors
struct  transpose_vectors< half_t, NX, NY >
struct  transpose_vectors< int8_t, NX, NY >
struct  transpose_vectors< f8_t, NX, NY >
struct  Tuple
struct  Tuple<>
struct  tuple_element
struct  is_same
struct  is_same< X, X >
struct  is_floating_point
struct  is_floating_point< float >
struct  is_floating_point< double >
struct  is_floating_point< long double >
struct  is_integral
struct  is_integral< int >
struct  is_integral< unsigned int >
struct  is_integral< long >
struct  is_integral< unsigned long >
struct  is_integral< short >
struct  is_integral< unsigned short >
struct  is_integral< long long >
struct  is_integral< unsigned long long >
struct  is_integral< char >
struct  is_integral< signed char >
struct  is_integral< unsigned char >
struct  is_integral< wchar_t >
struct  is_integral< char16_t >
struct  is_integral< char32_t >
struct  is_integral< bool >
struct  workgroup_barrier

Typedefs

using index_t = int32_t
using long_index_t = int64_t
template<typename T>
using iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type
template<typename T>
using iter_reference_t = decltype(*std::declval<T&>())
template<typename T>
using iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type
template<typename TensorDesc>
using TensorCoordinate_t
template<typename TensorDesc>
using TensorCoordinateStep_t
typedef unsigned char fp8_storage_t
using f8_t = f8_fnuz_t
using bf8_t = bf8_fnuz_t
template<index_t N>
using MultiIndex = Array<index_t, N>
using tf32_t = _BitInt(19)
using bhalf_t = ushort
using half_t = _Float16
using int4_t = _BitInt(4)
using f4_t = unsigned _BitInt(4)
using f6_t = _BitInt(6)
using bf6_t = unsigned _BitInt(6)
using f6x16_pk_t = f6_pk_t<f6_t, 16>
using f6x32_pk_t = f6_pk_t<f6_t, 32>
using bf6x16_pk_t = f6_pk_t<bf6_t, 16>
using bf6x32_pk_t = f6_pk_t<bf6_t, 32>
template<typename X, typename Y>
using has_same_scalar_type
template<typename T>
using element_type_t = typename packed_type_info<T>::element_type
template<typename T, index_t N = 0>
using packed_type_t = typename packed_type_maker<T, N>::packed_type
using int64_t = long
using double2_t = typename vector_type<double, 2>::type
using double4_t = typename vector_type<double, 4>::type
template<typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type
using float2_t = typename vector_type<float, 2>::type
using float4_t = typename vector_type<float, 4>::type
using float8_t = typename vector_type<float, 8>::type
using float16_t = typename vector_type<float, 16>::type
using float32_t = typename vector_type<float, 32>::type
using float64_t = typename vector_type<float, 64>::type
using half2_t = typename vector_type<half_t, 2>::type
using half4_t = typename vector_type<half_t, 4>::type
using half8_t = typename vector_type<half_t, 8>::type
using half16_t = typename vector_type<half_t, 16>::type
using half32_t = typename vector_type<half_t, 32>::type
using bhalf2_t = typename vector_type<bhalf_t, 2>::type
using bhalf4_t = typename vector_type<bhalf_t, 4>::type
using bhalf8_t = typename vector_type<bhalf_t, 8>::type
using bhalf16_t = typename vector_type<bhalf_t, 16>::type
using bhalf32_t = typename vector_type<bhalf_t, 32>::type
using int32x2_t = typename vector_type<int32_t, 2>::type
using int32x4_t = typename vector_type<int32_t, 4>::type
using int32x6_t = typename vector_type<int32_t, 6>::type
using int32x8_t = typename vector_type<int32_t, 8>::type
using int32x16_t = typename vector_type<int32_t, 16>::type
using int32x32_t = typename vector_type<int32_t, 32>::type
using int32x64_t = typename vector_type<int32_t, 64>::type
using int8x2_t = typename vector_type<int8_t, 2>::type
using int8x4_t = typename vector_type<int8_t, 4>::type
using int8x8_t = typename vector_type<int8_t, 8>::type
using int8x16_t = typename vector_type<int8_t, 16>::type
using int8x32_t = typename vector_type<int8_t, 32>::type
using int8x64_t = typename vector_type<int8_t, 64>::type
using f8x2_fnuz_t = typename vector_type<f8_fnuz_t, 2>::type
using f8x4_fnuz_t = typename vector_type<f8_fnuz_t, 4>::type
using f8x8_fnuz_t = typename vector_type<f8_fnuz_t, 8>::type
using f8x16_fnuz_t = typename vector_type<f8_fnuz_t, 16>::type
using f8x32_fnuz_t = typename vector_type<f8_fnuz_t, 32>::type
using f8x64_fnuz_t = typename vector_type<f8_fnuz_t, 64>::type
using bf8x2_fnuz_t = typename vector_type<bf8_fnuz_t, 2>::type
using bf8x4_fnuz_t = typename vector_type<bf8_fnuz_t, 4>::type
using bf8x8_fnuz_t = typename vector_type<bf8_fnuz_t, 8>::type
using bf8x16_fnuz_t = typename vector_type<bf8_fnuz_t, 16>::type
using bf8x32_fnuz_t = typename vector_type<bf8_fnuz_t, 32>::type
using bf8x64_fnuz_t = typename vector_type<bf8_fnuz_t, 64>::type
using f8x2_ocp_t = typename vector_type<f8_ocp_t, 2>::type
using f8x4_ocp_t = typename vector_type<f8_ocp_t, 4>::type
using f8x8_ocp_t = typename vector_type<f8_ocp_t, 8>::type
using f8x16_ocp_t = typename vector_type<f8_ocp_t, 16>::type
using f8x32_ocp_t = typename vector_type<f8_ocp_t, 32>::type
using f8x64_ocp_t = typename vector_type<f8_ocp_t, 64>::type
using bf8x2_ocp_t = typename vector_type<bf8_ocp_t, 2>::type
using bf8x4_ocp_t = typename vector_type<bf8_ocp_t, 4>::type
using bf8x8_ocp_t = typename vector_type<bf8_ocp_t, 8>::type
using bf8x16_ocp_t = typename vector_type<bf8_ocp_t, 16>::type
using bf8x32_ocp_t = typename vector_type<bf8_ocp_t, 32>::type
using bf8x64_ocp_t = typename vector_type<bf8_ocp_t, 64>::type
using uint8x2_t = typename vector_type<uint8_t, 2>::type
using uint8x4_t = typename vector_type<uint8_t, 4>::type
using uint8x8_t = typename vector_type<uint8_t, 8>::type
using uint8x16_t = typename vector_type<uint8_t, 16>::type
using uint8x32_t = typename vector_type<uint8_t, 32>::type
using uint8x64_t = typename vector_type<uint8_t, 64>::type
using f4x2_t = typename vector_type<f4x2_pk_t, 1>::type
using f4x4_t = typename vector_type<f4x2_pk_t, 2>::type
using f4x8_t = typename vector_type<f4x2_pk_t, 4>::type
using f4x16_t = typename vector_type<f4x2_pk_t, 8>::type
using f4x32_t = typename vector_type<f4x2_pk_t, 16>::type
using f4x64_t = typename vector_type<f4x2_pk_t, 32>::type
using f6x16_t = typename vector_type<f6x16_pk_t, 1>::type
using f6x16x2_t = typename vector_type<f6x16_pk_t, 2>::type
using f6x32_t = typename vector_type<f6x32_pk_t, 1>::type
using bf6x16_t = typename vector_type<bf6x16_pk_t, 1>::type
using bf6x16x2_t = typename vector_type<bf6x16_pk_t, 2>::type
using bf6x32_t = typename vector_type<bf6x32_pk_t, 1>::type
using e8m0x4_bexp_t = typename vector_type<e8m0_bexp_t, 4>::type
using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type
using pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type
using pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type
template<bool B, typename T = void>
using enable_if = std::enable_if<B, T>
template<bool B, typename T = void>
using enable_if_t = typename std::enable_if<B, T>::type
template<bool predicate, class X, class Y>
using conditional_t = typename conditional<predicate, X, Y>::type
template<bool B>
using bool_constant = integral_constant<bool, B>
using true_type = bool_constant<true>
using false_type = bool_constant<false>
template<template< class... > class Op, class... Args>
using is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t
template<typename T>
using is_pack2_invocable_t = decltype(ck::declval<T&>().is_pack2_invocable)
template<typename T>
using is_pack4_invocable_t = decltype(ck::declval<T&>().is_pack4_invocable)
template<typename T>
using is_pack8_invocable_t = decltype(ck::declval<T&>().is_pack8_invocable)
template<index_t N>
using Number = integral_constant<index_t, N>
template<index_t N>
using LongNumber = integral_constant<long_index_t, N>
template<index_t N>
using make_index_sequence
template<typename Sx, typename Sy>
using sequence_merge_t = typename sequence_merge<Sx, Sy>::type
template<index_t NSize, index_t I>
using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type
template<typename T, index_t N>
using StaticallyIndexedArray = typename detail::StaticallyIndexedArrayImpl<T, N>::type
template<index_t I, typename TTuple>
using tuple_element_t = typename tuple_element<I, TTuple>::type
template<typename T>
using is_tuple = decltype(ck::declval<T&>().IsTuple())
template<typename T>
using remove_reference_t = typename remove_reference<T>::type
template<typename T>
using remove_cv_t = typename remove_cv<T>::type
template<typename T>
using remove_cvref_t = remove_cv_t<remove_reference_t<T>>
template<typename T>
using remove_pointer_t = typename remove_pointer<T>::type

Enumerations

enum struct  InMemoryDataOperationEnum {
  Set ,
  AtomicAdd ,
  AtomicMax ,
  Add
}
enum  StreamKReductionStrategy {
  Atomic = 0 ,
  Reduction
}
enum struct  PipelineVersion {
  v1 ,
  v2 ,
  v4 ,
  weight_only
}
enum  Activation {
  gelu_and_mul = 0 ,
  silu_and_mul = 1
}
enum  Activation {
  gelu_and_mul = 0 ,
  silu_and_mul = 1
}
enum  Activation {
  gelu_and_mul = 0 ,
  silu_and_mul = 1
}
enum  Activation {
  gelu_and_mul = 0 ,
  silu_and_mul = 1
}
enum  Activation {
  gelu_and_mul = 0 ,
  silu_and_mul = 1
}
enum struct  DppInstr {
  dpp8_f16_1x32x2 = 0 ,
  dpp8_f16_2x16x2 ,
  dpp8_f16_2x32x2 ,
  dpp8_f16_4x16x2 ,
  dpp8_f16_4x32x2 ,
  dpp8_f16_8x16x2 ,
  dpp8_f16_8x32x2 ,
  dpp8_f16_16x16x2 ,
  dpp8_f16_32x8x2
}
enum struct  SmfmacInstr {
  smfmac_f32_16x16x32f16 = 0 ,
  smfmac_f32_32x32x16f16 ,
  smfmac_f32_16x16x32bf16 ,
  smfmac_f32_32x32x16bf16
}
enum struct  WmmaInstr {
  wmma_f32_16x16x16_f16 = 0 ,
  wmma_f32_16x16x16_bf16 ,
  wmma_f16_16x16x16_f16 ,
  wmma_bf16_16x16x16_bf16 ,
  wmma_i32_16x16x16_iu8 ,
  wmma_i32_16x16x16_iu4 ,
  wmma_f32_16x16x16_f16_gfx12 ,
  wmma_f32_16x16x16_bf16_gfx12 ,
  wmma_i32_16x16x16_iu8_gfx12 ,
  wmma_f32_16x16x16_f8f8_gfx12 ,
  wmma_f32_16x16x16_f8bf8_gfx12 ,
  wmma_f32_16x16x16_bf8f8_gfx12 ,
  wmma_f32_16x16x16_bf8bf8_gfx12
}
enum struct  MfmaInstr {
  mfma_f32_32x32x1f32 = 0 ,
  mfma_f32_16x16x1f32 ,
  mfma_f32_4x4x1f32 ,
  mfma_f32_32x32x2f32 ,
  mfma_f32_16x16x4f32 ,
  mfma_f32_32x32x4f16 ,
  mfma_f32_16x16x4f16 ,
  mfma_f32_4x4x4f16 ,
  mfma_f32_32x32x8f16 ,
  mfma_f32_16x16x16f16 ,
  mfma_f32_32x32x8bf16_1k ,
  mfma_f32_16x16x16bf16_1k ,
  mfma_f32_32x32x4bf16 ,
  mfma_f32_16x16x8bf16 ,
  mfma_i32_32x32x8i8 ,
  mfma_i32_16x16x16i8 ,
  mfma_i32_32x32x16i8 ,
  mfma_i32_16x16x32i8 ,
  mfma_f64_16x16x4f64 ,
  mfma_f32_32x32x16f8f8 ,
  mfma_f32_16x16x32f8f8 ,
  mfma_f32_32x32x16bf8bf8 ,
  mfma_f32_16x16x32bf8bf8 ,
  mfma_f32_32x32x16f8bf8 ,
  mfma_f32_16x16x32f8bf8 ,
  mfma_f32_32x32x16bf8f8 ,
  mfma_f32_16x16x32bf8f8 ,
  mfma_f32_32x32x16f16 ,
  mfma_f32_16x16x32f16 ,
  mfma_f32_32x32x16bf16 ,
  mfma_f32_16x16x32bf16 ,
  mfma_i32_32x32x32i8 ,
  mfma_i32_16x16x64i8 ,
  mfma_f32_32x32x64f8f6f4 ,
  mfma_f32_16x16x128f8f6f4 ,
  mfma_scale_f32_32x32x64f8f6f4 ,
  mfma_scale_f32_16x16x128f8f6f4 ,
  mfma_f32_16x16x8xf32 ,
  mfma_f32_32x32x4xf32 ,
  wmma_f32_16x16x16_f16 ,
  wmma_f32_16x16x16_bf16 ,
  wmma_i32_16x16x16_iu8 ,
  wmma_unsupport_16x16_gfx11 ,
  wmma_f32_16x16x16_f16_gfx12 ,
  wmma_f32_16x16x16_bf16_gfx12 ,
  wmma_i32_16x16x16_iu8_gfx12 ,
  wmma_f32_16x16x16_f8f8_gfx12 ,
  wmma_f32_16x16x16_f8bf8_gfx12 ,
  wmma_f32_16x16x16_bf8f8_gfx12 ,
  wmma_f32_16x16x16_bf8bf8_gfx12 ,
  wmma_unsupport_16x16_gfx12
}
enum struct  AddressSpaceEnum {
  Generic ,
  Global ,
  Lds ,
  Sgpr ,
  Vgpr
}
enum struct  AmdBufferCoherenceEnum {
  DefaultCoherence = 0 ,
  GLC = 1 ,
  SLC = 2 ,
  GLC_SLC = 3 ,
  WAVE_NT0 = 0 ,
  WAVE_NT1 = 2 ,
  GROUP_NT0 = 1 ,
  GROUP_NT1 = 3 ,
  DEVICE_NT0 = 8 ,
  DEVICE_NT1 = 10 ,
  SYSTEM_NT0 = 9 ,
  SYSTEM_NT1 = 11
}
enum struct  AmdBufferCoherenceEnum {
  DefaultCoherence = 0 ,
  GLC = 1 ,
  SLC = 2 ,
  GLC_SLC = 3 ,
  WAVE_NT0 = 0 ,
  WAVE_NT1 = 2 ,
  GROUP_NT0 = 1 ,
  GROUP_NT1 = 3 ,
  DEVICE_NT0 = 8 ,
  DEVICE_NT1 = 10 ,
  SYSTEM_NT0 = 9 ,
  SYSTEM_NT1 = 11
}
enum class  ck_fp8_interpretation_t {
  CK_E4M3_OCP = 0 ,
  CK_E5M2_OCP = 1 ,
  CK_E4M3_FNUZ = 2 ,
  CK_E5M2_FNUZ = 3
}
 Describes FP8 interpretation. More...
enum class  ck_saturation_t {
  CK_NOSAT = 0 ,
  CK_SATFINITE = 1
}
 Describes saturation behavior. More...
enum struct  BlockGemmPipelineVersion {
  v1 ,
  v2 ,
  v3 ,
  v4 ,
  v5
}
enum struct  BlockGemmPipelineScheduler {
  Intrawave ,
  Interwave
}
enum struct  TailNumber {
  Odd ,
  Even ,
  One ,
  Two ,
  Three ,
  Four ,
  Five ,
  Six ,
  Seven ,
  Empty ,
  Full
}
enum  SchedulerGroup : uint32_t {
  SCHED_GROUP_MFMA = 0x008 ,
  SCHED_GROUP_VMEM = 0x020 ,
  SCHED_GROUP_LDS_READ = 0x100 ,
  SCHED_GROUP_LDS_WRITE = 0x200
}
enum class  f8_rounding_mode {
  standard ,
  stochastic
}
enum struct  LoopScheduler {
  Default ,
  Interwave
}
enum struct  ReduceTensorOp {
  ADD = 0 ,
  MUL = 1 ,
  MIN = 2 ,
  MAX = 3 ,
  AMAX = 4 ,
  AVG = 5 ,
  NORM1 = 6 ,
  NORM2 = 7
}
enum struct  NanPropagation {
  NOT_PROPAGATE_NAN = 0 ,
  PROPAGATE_NAN = 1
}
enum struct  ReduceTensorIndices {
  NO_INDICES = 0 ,
  FLATTENED_INDICES = 1
}
enum struct  IndicesType {
  INDICES_32BIT = 0 ,
  INDICES_64BIT = 1 ,
  INDICES_16BIT = 2 ,
  INDICES_8BIT = 3
}

Functions

constexpr unsigned int fnv1a_hash (std::string_view str, unsigned int h=2166136261u)
std::string get_device_name ()
bool is_gfx12_supported ()
bool is_gfx11_supported ()
bool is_xdl_supported ()
template<typename ADataType, typename BDataType, index_t MPerXDL, index_t NPerXDL>
bool is_xdl_wmma_supported ()
bool is_lds_direct_load_supported ()
bool is_bf16_atomic_supported ()
bool is_gfx101_supported ()
bool is_gfx103_supported ()
bool is_wmma_supported ()
bool is_tf32_supported ()
template<typename T, typename ForwardIterator, typename Size, typename BinaryOperation>
auto accumulate_n (ForwardIterator first, Size count, T init, BinaryOperation op) -> decltype(std::accumulate(first, std::next(first, count), init, op))
unsigned int get_available_cpu_cores ()
template<typename... In, typename... Wei, typename... Out, typename ConvStrides, typename ConvDilations, typename InLeftPads, typename InRightPads, index_t GemmK1Value>
__host__ __device__ constexpr auto transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad (const TensorDescriptor< In... > &in_grid_desc_n_di_hi_wi_c, const TensorDescriptor< Wei... > &wei_k_z_y_x_c_grid_desc, const TensorDescriptor< Out... > &out_n_do_ho_wo_k_grid_desc, const ConvStrides &conv_strides, const ConvDilations &conv_dilations, const InLeftPads &in_left_pads, const InRightPads &in_right_pads, Number< GemmK1Value >)
template<AddressSpaceEnum AddressSpace, typename T, typename TensorDesc, typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ __device__ constexpr auto make_static_tensor (TensorDesc)
template<AddressSpaceEnum AddressSpace, typename T, typename TensorDesc, typename X, typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X > >::value, bool >::type = false>
__host__ __device__ constexpr auto make_static_tensor (TensorDesc, X invalid_element_value)
template<typename Lengths, typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ __device__ constexpr auto make_cluster_descriptor (const Lengths &lengths, ArrangeOrder order=typename arithmetic_sequence_gen< 0, Lengths::Size(), 1 >::type{})
template<typename LowLength>
__host__ __device__ constexpr auto make_pass_through_transform (const LowLength &low_length)
template<typename LowLength, typename LeftPad, typename RightPad, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto make_pad_transform (const LowLength &low_length, const LeftPad &left_pad, const RightPad &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
template<typename LowLength, typename LeftPadLength, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto make_left_pad_transform (const LowLength &low_length, const LeftPadLength &left_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
template<typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto make_right_pad_transform (const LowLength &low_length, const RightPadLength &right_pad, integral_constant< bool, SkipIsValidCheck >=integral_constant< bool, false >{})
template<typename UpLengths, typename Coefficients, typename enable_if< UpLengths::Size()==Coefficients::Size(), bool >::type = false>
__host__ __device__ constexpr auto make_embed_transform (const UpLengths &up_lengths, const Coefficients &coefficients)
template<typename LowLengths>
__host__ __device__ constexpr auto make_merge_transform (const LowLengths &low_lengths)
template<typename LowLengths>
__host__ __device__ constexpr auto make_merge_transform_v1_carry_check (const LowLengths &low_lengths)
template<typename LowLengths>
__host__ __device__ constexpr auto make_merge_transform_v2_magic_division (const LowLengths &low_lengths)
template<typename LowLengths>
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod (const LowLengths &low_lengths)
template<typename UpLengths, bool Use24BitIntegerCalculation = false>
__host__ __device__ constexpr auto make_unmerge_transform (const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
__host__ __device__ constexpr auto make_conv_bwd_data_out_transform (index_t N, index_t Ho, index_t Wo, index_t K, index_t YDot, index_t XDot, index_t HTilde, index_t WTilde, index_t ConvDilationH, index_t ConvDilationW, index_t HTildeSlice, index_t WTildeSlice, index_t YDotSlice, index_t XDotSlice, index_t IHTildeSliceBegin, index_t IWTildeSliceBegin, index_t GcdStrideDilationH, index_t GcdStrideDilationW, index_t K0, index_t K1, index_t MPerBlock, index_t GemmKPerBlock)
template<typename LowerIndex>
__host__ __device__ constexpr auto make_freeze_transform (const LowerIndex &low_idx)
template<typename UpperIndex>
__host__ __device__ constexpr auto make_insert_transform (const UpperIndex &up_idx)
template<typename LowLength, typename SliceBegin, typename SliceEnd>
__host__ __device__ constexpr auto make_slice_transform (const LowLength &low_length, const SliceBegin &slice_begin, const SliceEnd &slice_end)
template<typename VectorSize, typename UpLength>
__host__ __device__ constexpr auto make_vectorize_transform (const VectorSize &vector_size, const UpLength &up_length)
template<typename Modulus, typename UpLength>
__host__ __device__ constexpr auto make_modulo_transform (const Modulus &modulus, const UpLength &up_length)
template<typename LowLengths>
__host__ __device__ constexpr auto make_xor_with_modulo_transform (const LowLengths &low_lengths)
template<typename LowLengths>
__host__ __device__ constexpr auto make_xor_transform (const LowLengths &low_lengths)
template<typename TensorAdaptor0, typename TensorAdaptor1>
__host__ __device__ constexpr auto chain_tensor_adaptors (const TensorAdaptor0 &adaptor0, const TensorAdaptor1 &adaptor1)
template<typename Transforms, typename LowerDimensionOldTopIdss, typename UpperDimensionNewTopIdss>
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor (const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
template<typename X, typename... Xs, typename enable_if< sizeof...(Xs) >= 2, bool >::type = false>
__host__ __device__ constexpr auto chain_tensor_adaptors (const X &x, const Xs &... xs)
template<typename OldTensorDescriptor, typename NewTransforms, typename NewLowerDimensionOldVisibleIdss, typename NewUpperDimensionNewVisibleIdss>
__host__ __device__ constexpr auto transform_tensor_descriptor (const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
template<typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto make_tensor_coordinate (const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
template<typename TensorDesc, typename VisibleIndex, typename UpdateLowerIndexHack>
__host__ __device__ constexpr auto make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
template<typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto make_tensor_coordinate_step (const TensorDesc &, const VisibleIndex &idx_diff_visible)
template<typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ constexpr void move_tensor_coordinate (const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
template<typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid (const TensorDesc &tensor_desc, const TensorCoord &coord)
template<typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool coordinate_has_valid_offset (const TensorDesc &tensor_desc, const TensorCoord &coord)
template<typename... Lengths, typename... Strides, typename enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
__host__ __device__ constexpr auto make_naive_tensor_descriptor (const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
template<typename... Lengths>
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed (const Tuple< Lengths... > &lengths)
template<typename... Lengths, typename Align>
__host__ __device__ constexpr auto make_naive_tensor_descriptor_aligned (const Tuple< Lengths... > &lengths, Align align)
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeTypeA, typename ComputeTypeB, typename AccDataType, typename AWmmaTileDesc, typename BWmmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false>
constexpr auto BlockGemmPipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmABScalePipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXBPreshufflePipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmBPreshufflePipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmPipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmBlockScaleBPreshufflePipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmMXBPreshufflePipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXNBSPipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
constexpr auto BlockGemmMXPipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
constexpr auto BlockGemmMXPipeline_Selector ()
template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool DirectLoad = false>
constexpr auto BlockGemmPipeline_Selector ()
template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename AK0MK1BlockDesc, typename BK0NK1BlockDesc, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, LoopScheduler LoopSched, typename ComputeTypeA = FloatA, typename ComputeTypeB = FloatB>
constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector ()
template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ComputePtrOffsetOfBatch, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map)
template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_xdl_cshuffle_v3_multi_d (BatchedGemmArg karg)
template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds (BatchedGemmArg karg)
template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_b_scale_xdl_cshuffle_v3 (BatchedGemmArg karg)
template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds (BatchedGemmArg karg)
template<typename GridwiseGemm, typename AsPointer, typename BsPointer, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AsGridDesc_AK0_M_AK1, typename BsGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_abd_xdl_cshuffle (AsPointer p_as_grid, BsPointer p_bs_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1, const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
template<typename GridwiseElementwiseReduction, typename InDataTypePointerTuple, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename AccDataType, typename XElementwiseOperation, typename YElementwiseOperation, typename InGrid2dDescTuple, typename GridDesc_M_K>
__global__ void kernel_elementwise_layernorm (const InGrid2dDescTuple in_grid_2d_desc_tuple, const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, index_t num_k_block_tile_iteration, AccDataType epsilon, const InDataTypePointerTuple p_in_global_tuple, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, const XElementwiseOperation x_elementwise_op, const YElementwiseOperation y_elementwise_op)
template<typename GridwiseGemm, typename ABDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_K0_M0_M1_K1, typename BGridDesc_K0_N0_N1_K1, typename DsGridDesc_M0_M10_M11_N0_N10_N11, typename CGridDesc_M0_M10_M11_N0_N10_N11, typename Block2CTileMap, bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void kernel_gemm_dl_multiple_d (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11, const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, typename EMeanVarDataType, bool HasMainKBlockLoop, InMemoryDataOperationEnum EGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_multiple_d_welford_first_half_wmma_cshuffle_v3 (typename GridwiseGemm::Argument karg, EMeanVarDataType *__restrict__ p_welford_mean_grid, EMeanVarDataType *__restrict__ p_welford_var_grid, int32_t *__restrict__ p_welford_count_grid)
template<typename GridwiseWelfordLayernorm, typename EMeanVarDataType, typename HDataType, typename GammaDataType, typename BetaDataType, typename ComputeDataType, typename EHGridDesc_M_N, typename LayernormMeanVarGridDesc_M_NBlock, typename LayernormCountGridDesc_M_NBlock, typename GammaBetaGridDesc_N, typename HElementwiseOperation>
__global__ void kernel_welford_layernorm2d_second_half (const EMeanVarDataType *__restrict__ p_e_grid, const EMeanVarDataType *__restrict__ p_in_welford_mean_grid, const EMeanVarDataType *__restrict__ p_in_welford_var_grid, const int32_t *__restrict__ p_in_welford_count_grid, const GammaDataType *__restrict__ p_gamma_grid, const BetaDataType *__restrict__ p_beta_grid, HDataType *__restrict__ p_h_grid, const EHGridDesc_M_N e_grid_desc_m_n, const EHGridDesc_M_N h_grid_desc_m_n, const LayernormMeanVarGridDesc_M_NBlock mean_var_grid_desc_m_nblock, const LayernormCountGridDesc_M_NBlock count_grid_desc_m_nblock, const GammaBetaGridDesc_N gamma_grid_desc_n, const GammaBetaGridDesc_N beta_grid_desc_n, index_t numMeanVarCountBlockTileIteration_N, index_t NBlockClusterLength, ComputeDataType epsilon, HElementwiseOperation h_element_op)
template<typename GridwiseGemmWelford, typename ABDataType, typename DsPointer, typename EMeanVarDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename MeanVarGridDescriptor_MBlock_MPerBlock_NBlock, typename CountGridDescriptor_MBlock_MPerBlock_NBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EMeanVarDataType *__restrict__ p_e_grid, EMeanVarDataType *__restrict__ p_welford_mean_grid, EMeanVarDataType *__restrict__ p_welford_var_grid, int32_t *__restrict__ p_welford_count_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const MeanVarGridDescriptor_MBlock_MPerBlock_NBlock mean_var_grid_desc_mblock_mperblock_nblock, const CountGridDescriptor_MBlock_MPerBlock_NBlock count_grid_desc_mblock_mperblock_nblock, const Block2ETileMap block_2_etile_map, index_t NRaw)
template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename FloatRsPointer, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename RsGridDescriptor_MBlock_MPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_multiple_r_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, FloatRsPointer p_rs_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const QsElementwiseOperation qs_element_op, const RsElementwiseOperation rs_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock, const Block2ETileMap block_2_etile_map)
template<typename GridwiseGemm, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_xdl_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
template<typename GridwiseGemm, typename ABDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename EElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_waveletmodel_cshuffle (const ABDataType *__restrict__ p_a_grid, const ABDataType *__restrict__ p_b_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const EElementwiseOperation e_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
template<typename GridwiseGemm, typename ContractionMultiDKernelArg, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, bool HasMainKBlockLoop>
__global__ void kernel_grouped_contraction_multiple_d_xdl_cshuffle (const void CK_CONSTANT_ADDRESS_SPACE *contraction_args, const index_t group_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op)
template<typename GridwiseWelford, typename XDataType, typename WorkspaceMeanVarDataType, typename ComputeDataType, typename XGridDesc_M_K, typename MeanVarGridDesc_M_KBlock>
__global__ void kernel_normalizationSplitK1st (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x_global, WorkspaceMeanVarDataType *const __restrict__ p_welford_mean, WorkspaceMeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count)
template<typename GridwiseWelfordNormalization, typename WorkspaceMeanVarDataType, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename MeanVarGridDesc_M_KBlock, typename CountGridDesc_M_KBlock, typename XYGammaBetaGridDesc_M_K, typename SaveMeanInvStdGridDesc_M>
__global__ void kernel_normalizationSplitK2nd (const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock, const CountGridDesc_M_KBlock count_grid_desc_m_kblock, const XYGammaBetaGridDesc_M_K x_grid_desc_m_k, const XYGammaBetaGridDesc_M_K gamma_grid_desc_m_k, const XYGammaBetaGridDesc_M_K beta_grid_desc_m_k, const XYGammaBetaGridDesc_M_K y_grid_desc_m_k, const SaveMeanInvStdGridDesc_M save_mean_grid_desc_m, const SaveMeanInvStdGridDesc_M save_inv_std_grid_desc_m, index_t num_k_mean_var_count_iteration, index_t num_k_block_tile_iteration, index_t k_grid_size, ComputeDataType epsilon, const WorkspaceMeanVarDataType *const p_mean_global, const WorkspaceMeanVarDataType *const p_variance_global, const int32_t *const p_welford_count_global, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op)
template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AKB_AK0_M_AK1, typename BGridDesc_BKB_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ComputePtrOffsetOfBatch, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_xdl_cshuffle (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatDsPointer p_ds_grid, FloatE *__restrict__ p_e_grid, const index_t batch_count, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1, const BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2ETileMap block_2_etile_map)
__device__ half4_t i4_to_half4 (int q)
__device__ half4_t i4_to_half4_scale (int q, const ck::half2_t &scale)
__device__ f8x4_t i4_to_f8x4 (int q)
__device__ f8x8_t i4_to_fp8x8 (int q)
__device__ bhalf4_t i4_to_bhalf4 (int q)
template<typename GridwiseMultiblockBatchNormForward_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename MeanVarCountGridDesc_M_G, typename MeanVarCountGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor>
__global__ void kernel_multiblock_batchnorm_forward (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, MeanVarDataType *const __restrict__ p_welford_mean, MeanVarDataType *const __restrict__ p_welford_variance, int32_t *const __restrict__ p_welford_count, int32_t *const __restrict__ p_control, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
template<typename GridwiseReduceSecondHalfBatchNormBackwardFinal_, typename XDataType, typename DyDataType, typename DxDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename DscaleDbiasGridDesc_M_K, typename MeanVarGridDesc_M, typename ScaleBiasGridDesc_M>
__global__ void kernel_reduce_second_half_batchnorm_backward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, index_t blkgroup_size, long_index_t reduce_size, index_t num_xy_k_block_tile_iteration, index_t num_dscale_dbias_k_block_tile_iteration, const DscaleDbiasDataType *const __restrict__ p_reduce_dscale, const DscaleDbiasDataType *const __restrict__ p_reduce_dbias, const MeanVarDataType *const __restrict__ p_mean, const MeanVarDataType *const __restrict__ p_inv_var, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias)
template<typename GridwiseMultiblockWelfordFirstHalf_, typename XDataType, typename MeanVarDataType, typename XGridDesc_M_K, typename MeanVarCountGridDesc_M_G, typename GetReduceCountPerThreadFunctor>
__global__ void kernel_multiblock_welford_first_half (const XGridDesc_M_K x_grid_desc_m_k, const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, const XDataType *const __restrict__ p_x, MeanVarDataType *const p_welford_mean, MeanVarDataType *const p_welford_variance, int32_t *const p_welford_count)
template<typename GridwiseWelfordSecondHalfBatchNormForwardFinal_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename MeanVarCountGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M>
__global__ void kernel_welford_second_half_batchnorm_forward_final (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, AccDataType epsilon, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
template<typename GridwiseWelfordSecondHalfReduceFirstHalf_, typename XDataType, typename DyDataType, typename AccDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename MeanVarGridDesc_M, typename MeanVarCountGridDesc_M_K, typename DscaleDbiasGridDesc_M_G>
__global__ void kernel_welford_second_half_reduce_first_half (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const MeanVarGridDesc_M mean_var_grid_desc_m, const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g, index_t blkgroup_size, index_t num_xy_k_block_tile_iteration, index_t num_mean_var_count_k_block_tile_iteration, AccDataType epsilon, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const MeanVarDataType *const __restrict__ p_in_welford_mean, const MeanVarDataType *const __restrict__ p_in_welford_variance, const int32_t *const __restrict__ p_in_welford_count, const DyElementwiseOp dy_elementwise_op, MeanVarDataType *const __restrict__ p_out_welford_mean, MeanVarDataType *const __restrict__ p_out_welford_inv_variance, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, DscaleDbiasDataType *const __restrict__ p_reduce_dscale, DscaleDbiasDataType *const __restrict__ p_reduce_dbias)
template<typename CTileIdx, typename CTileDim>
__host__ __device__ bool DefaultValidCTileIndex (const CTileIdx &c_tile_idx, const CTileDim &c_tile_dim)
template<typename GridwiseMultipleReduction, index_t NumReduction, typename InDataType, typename OutDataTypePointerTuple, typename AccDataType, typename InGridDesc_M_K, typename OutGridDesc_M_Tuple, typename InElementwiseOperationTuple, typename AccElementwiseOperationTuple>
__global__ void kernel_multiple_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, index_t block_group_size, index_t num_k_block_tile_iteration, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple)
template<typename GridwiseMultipleReduction, index_t NumReduction, typename InDataType, typename OutDataTypePointerTuple, typename AccDataType, typename InGridDesc_M_K, typename OutGridDesc_M_Tuple, typename InElementwiseOperationTuple, typename AccElementwiseOperationTuple>
__global__ void kernel_multiple_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M_Tuple out_grid_desc_m_tuple, const InElementwiseOperationTuple in_elementwise_op_tuple, const AccElementwiseOperationTuple acc_elementwise_op_tuple, Array< AccDataType, NumReduction > alpha_values, const InDataType *const __restrict__ p_in_value_global, Array< AccDataType, NumReduction > beta_values, OutDataTypePointerTuple p_out_value_global_tuple)
template<typename GridwiseReduction, bool OutputIndex, bool HaveIndexInput, typename InDataType, typename OutDataType, typename AccDataType, typename IndexDataType, typename InGridDesc_M_K, typename OutGridDesc_M, typename InElementwiseOperation, typename AccElementwiseOperation>
__global__ void kernel_reduce_multiblock (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global)
template<typename GridwiseReduction, bool OutputIndex, bool TransformIndexKtoGlobal, bool HaveIndexInput, typename InDataType, typename OutDataType, typename AccDataType, typename IndexDataType, typename InGridDesc_M_K, typename OutGridDesc_M, typename InElementwiseOperation, typename AccElementwiseOperation>
__global__ void kernel_reduce_threadwise (const InGridDesc_M_K in_grid_desc_m_k, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const AccElementwiseOperation acc_elementwise_op, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, const IndexDataType *const __restrict__ p_in_index_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global, IndexDataType *const __restrict__ p_out_index_global)
template<typename GridwiseReduction, typename InDataType, typename OutDataType, typename AccDataType, typename InGridDesc_M_K, typename DsGridDesc_M, typename OutGridDesc_M, typename InElementwiseOperation, typename OutElementwiseOperation, typename DsGridPointer>
__global__ void kernel_reduce_threadwise_multi_d (const InGridDesc_M_K in_grid_desc_m_k, const DsGridDesc_M ds_grid_desc_m, const OutGridDesc_M out_grid_desc_m, const InElementwiseOperation in_elementwise_op, const OutElementwiseOperation out_elementwise_op, const InDataType *const __restrict__ p_in_value_global, const DsGridPointer p_ds_value_global, OutDataType *const __restrict__ p_out_value_global)
template<typename GridwiseBatchrNormBackwardWithBlockwiseWelford_, typename XDataType, typename DyDataType, typename DxDataType, typename AccDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor>
__global__ void kernel_batchnorm_backward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K dy_grid_desc_m_k, const XYGridDesc_M_K dx_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M dscale_dbias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, long_index_t reduce_size, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const DyDataType *const __restrict__ p_dy, const ScaleDataType *const __restrict__ p_scale, bool haveSavedMeanInvVar, const MeanVarDataType *const __restrict__ p_savedMean, const MeanVarDataType *const __restrict__ p_savedInvVar, const DyElementwiseOp dy_elementwise_op, DxDataType *const __restrict__ p_dx, DscaleDbiasDataType *const __restrict__ p_dscale, DscaleDbiasDataType *const __restrict__ p_dbias)
template<typename GridwiseBatchrNormForwardWithBlockwiseWelford_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor>
__global__ void kernel_batchnorm_forward_with_blockwise_welford (const XYGridDesc_M_K x_grid_desc_m_k, const XYGridDesc_M_K y_grid_desc_m_k, const ScaleBiasGridDesc_M scale_grid_desc_m, const ScaleBiasGridDesc_M bias_grid_desc_m, const MeanVarGridDesc_M mean_var_grid_desc_m, const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, index_t num_k_block_tile_iteration, AccDataType epsilon, const XDataType *const __restrict__ p_x, const ScaleDataType *const __restrict__ p_scale, const BiasDataType *const __restrict__ p_bias, const YElementwiseOp y_elementwise_op, YDataType *const __restrict__ p_y, bool updateMovingAverage, AccDataType averageFactor, MeanVarDataType *const __restrict__ resultRunningMean, MeanVarDataType *const __restrict__ resultRunningVariance, bool saveMeanInvVariance, MeanVarDataType *const __restrict__ resultSaveMean, MeanVarDataType *const __restrict__ resultSaveInvVariance)
template<typename GridwiseElementwise1dFunctor, typename InGrid1dDescTuple, typename OutGrid1dDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename ElementwiseOperation, typename UnaryOperation, typename Scale>
__global__ void kernel_elementwise_1d (const InGrid1dDescTuple in_grid_1d_desc_tuple, const OutGrid1dDescTuple out_grid_1d_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const ElementwiseOperation elementwise_op, const UnaryOperation unary_op, const Scale scale_op)
template<typename GridwiseElementwiseFunctor, typename InGridDescTuple, typename OutGridDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename Block2TileMap, typename ElementwiseOperation>
__global__ void kernel_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op)
template<typename GridwiseElementwiseFunctorA, typename GridwiseElementwiseFunctorB, typename InAGridDescTuple, typename InBGridDescTuple, typename OutAGridDescTuple, typename OutBGridDescTuple, typename InADataTypePointerTuple, typename InBDataTypePointerTuple, typename OutADataTypePointerTuple, typename OutBDataTypePointerTuple, typename Block2TileMapA, typename Block2TileMapB, typename ElementwiseOperation>
__global__ void kernel_elementwise_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size)
template<typename GridwiseElementwiseFunctorA, typename GridwiseElementwiseFunctorB, typename InAGridDescTuple, typename InBGridDescTuple, typename OutAGridDescTuple, typename OutBGridDescTuple, typename InADataTypePointerTuple, typename InBDataTypePointerTuple, typename OutADataTypePointerTuple, typename OutBDataTypePointerTuple, typename Block2TileMapA, typename Block2TileMapB, typename ElementwiseOperation, index_t NumInputsA, index_t NumInputsB, index_t NumOutputsA, index_t NumOutputsB>
__global__ void kernel_elementwise_batched_dual (const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size, const index_t batch_count_a, const index_t batch_count_b, const std::array< index_t, NumInputsA > input_batch_strides_a, const std::array< index_t, NumInputsB > input_batch_strides_b, const std::array< index_t, NumOutputsA > output_batch_strides_a, const std::array< index_t, NumOutputsB > output_batch_strides_b)
template<typename GridwiseElementwiseFunctor, typename InGridDescTuple, typename OutGridDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename Block2TileMap, typename ElementwiseOperation, index_t NumInputs, index_t NumOutputs>
__global__ void kernel_batched_elementwise (const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op, const index_t batch_count, const std::array< index_t, NumInputs > input_batch_strides, const std::array< index_t, NumOutputs > output_batch_strides)
template<typename GridwiseGemm, typename ADataType, typename BDataType, typename ScaleDataType, typename CDataType, typename AGridDesc, typename BGridDesc, typename ScaleGridDesc, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_fpAintB_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, const ScaleDataType *__restrict__ p_scale_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const ScaleGridDesc scale_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename FloatC0, typename FloatC1, typename ReducePtrsGlobal, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename C1ElementwiseOperation, typename ReduceInElementwiseOperations, typename ReduceAccElementwiseOperations, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ReduceGridDescriptor_MBlock_MPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_bias_add_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_bias_grid, const FloatC1 *__restrict__ p_d0_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const C1ElementwiseOperation c1_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c0_grid_desc_mblock_mperblock_nblock_nperblock, const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c1_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M0_M1_K1, typename BGridDesc_K0_N0_N1_K1, typename CGridDesc_M0_M10_M11_N0_N10_N11, typename Block2CTileMap, bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void kernel_gemm_dl_v1r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, const CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void kernel_gemm_dpp (const typename GridwiseGemm::Argument karg)
template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2CTileMap, typename ComputePtrOffsetOfBatch, bool HasMainKBlockLoop>
__global__ void kernel_grouped_conv_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const index_t batch_count, const AGridDesc_AK0_M_AK1 a_grid_desc, const BGridDesc_BK0_N_BK1 b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2CTileMap block_2_ctile_map, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AGridDesc, typename BGridDesc, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename ComputePtrOffsetOfBatch, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_contraction_multiple_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const index_t batch_count, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2CTileMap block_2_etile_map)
template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AGridDesc, typename BGridDesc, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_mupltipe_d_wmma_cshuffle (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const Block2ETileMap block_2_etile_map)
template<PipelineVersion PipelineVer, index_t NumPrefetch = 1, LoopScheduler LoopSched = LoopScheduler::Default, bool AEnableLds = true, bool BEnableLds = true>
constexpr auto GridwiseGemmPipeline_Selector ()
template<index_t NumPrefetch, LoopScheduler LoopSched>
constexpr auto GridwiseGemmPipeline_v1_Selector ()
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename ReducePtrsGlobal, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename ReduceInElementwiseOperations, typename ReduceAccElementwiseOperations, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ReduceGridDescriptor_MBlock_MPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_reduce_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, ReducePtrsGlobal p_reduces_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const ReduceInElementwiseOperations reduce_in_element_ops, const ReduceAccElementwiseOperations reduce_out_element_ops, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, typename ADataType, typename BDataType, typename CDataType, typename AGridDesc, typename BGridDesc, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_wmma (const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, CDataType *__restrict__ p_c_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum EGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_wmma_cshuffle_v3 (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3 (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_2lds (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v1 (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v1 (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, typename GridwiseGemm::Problem problem)
template<typename GridwiseGemm, bool HasMainKBlockLoop, index_t TailNum = 3>
__global__ void kernel_gemm_xdl_cshuffle_v2 (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdl_cshuffle_v2 (const FloatA *p_a_grid, const FloatB *p_b_grid, FloatC *p_c_grid, typename GridwiseGemm::Problem problem)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_b_preshuffle (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_2lds (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds (typename GridwiseGemm::Argument karg)
template<bool Use2LDS, typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t<!Use2LDS, void > kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg)
template<bool Use2LDS, typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t< Use2LDS, void > kernel_gemm_xdl_cshuffle_v3_mx (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename FloatC0, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C0GridDescriptor_NBlock_NPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_layernorm_xdl_cshuffle_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC0 *__restrict__ p_c0_bias_grid, const FloatC0 *__restrict__ p_c0_add_grid, const FloatC0 *__restrict__ p_c0_gamma_grid, const FloatC0 *__restrict__ p_c0_beta_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const AccElementwiseOperation acc_element_op, const CElementwiseOperation c_element_op, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock, const Block2CTileMap block_2_ctile_map)
template<typename LowLengths>
__host__ __device__ constexpr auto make_merge_transform_v4_no_carry (const LowLengths &low_lengths)
template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename CBlockClusterAdaptor, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_bwd_weight (const FloatA *__restrict__ p_a_grid, const FloatB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc, const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainK0BlockLoop>
__global__ void kernel_gemm_xdlops_skip_b_lds_v1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation>
__global__ void kernel_gemm_xdlops_splitk_lds_direct_load (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op)
template<typename GridwiseGemm>
__global__ void kernel_gemm_xdlops_streamk (const typename GridwiseGemm::FloatAB *p_a_grid, const typename GridwiseGemm::FloatAB *p_b_grid, typename GridwiseGemm::FloatC *p_c_grid, void *p_workspace, index_t M, index_t N, index_t K, index_t StrideA, index_t StrideB, index_t StrideC, typename GridwiseGemm::Block2CTileMap block_mapping)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDesc_M_N c_grid_desc_m_n)
template<typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r3 (const typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename ABK0MK1GridDesc, typename BBK0NK1GridDesc, typename CM0N0M1N1M2M3M4N2GridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename CBlockClusterAdaptor, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v2r4 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const ABK0MK1GridDesc a_b_k0_m_k1_grid_desc, const BBK0NK1GridDesc b_b_k0_n_k1_grid_desc, const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const CBlockClusterAdaptor c_block_cluster_adaptor)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation>
__global__ void kernel_gemm_xdlops_v2r4r2_simplified (typename GridwiseGemm::Argument karg, const Block2CTileMap &b2c_map, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainK0BlockLoop>
__global__ void kernel_gemm_xdlops_v3r1 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v3r2 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void kernel_gemm_xdlops_v3r3 (const FloatAB *__restrict__ p_a_grid, const FloatAB *__restrict__ p_b_grid, FloatC *__restrict__ p_c_grid, const FloatC *__restrict__ p_c0_grid, const FloatC *__restrict__ p_c1_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CElementwiseOperation c_element_op, const Block2CTileMap block_2_ctile_map)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_gemm (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_gemm_2lds (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_mxgemm_2lds (typename GridwiseGemm::Argument karg)
template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void kernel_moe_mxgemm (typename GridwiseGemm::Argument karg)
template<typename GridwisePermute, typename InGridDesc, typename OutGridDesc, typename InDataType, typename OutDataType, typename ElementwiseOperation, typename Block2TileMap>
__global__ void kernel_nd_permute (const InGridDesc in_grid_desc, const OutGridDesc out_grid_desc, const InDataType *p_in_global, OutDataType *p_out_global, const ElementwiseOperation elementwise_op, const Block2TileMap block_2_tile_map)
template<typename GridwisePutElementwise1dFunctor, typename InGrid1dDesc, typename InDataType, typename IndexDataType, typename OutDataType, typename ElementwiseOperation>
__global__ void kernel_put_element_1d (const InGrid1dDesc in_grid_1d_desc, const InDataType *__restrict__ p_in_global, const IndexDataType *__restrict__ p_indices_global, OutDataType *__restrict__ p_out_global, const ElementwiseOperation elementwise_op)
template<index_t BlockSize, typename DataType, typename Grid1dBufferDescType>
__global__ void kernel_buffer_set_value (const Grid1dBufferDescType grid_1d_buffer_desc, DataType *const __restrict__ p_global, DataType value)
template<typename Grid1dBufferDescTuple, index_t NumBuffer, index_t BlockSize, typename DataTypePointerTuple, typename DataTypeTuple>
__global__ void kernel_multiple_buffer_set_value (const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple, DataTypePointerTuple p_global_tuple, DataTypeTuple value_tuple)
template<typename GridwiseReduction, typename InDataType, typename OutDataType, typename AccDataType, typename GridDesc_M_K>
__global__ void kernel_softmax (const GridDesc_M_K in_grid_desc_m_k, const GridDesc_M_K out_grid_desc_m_k, index_t block_group_size, index_t num_k_block_tile_iteration, AccDataType alpha, const InDataType *const __restrict__ p_in_value_global, AccDataType beta, OutDataType *const __restrict__ p_out_value_global)
template<typename GridwiseSparseEmbedding, typename EmbType, typename IndexType, typename GammaDataType, typename BetaDataType, typename AccDataType, typename OutType, typename OutGridDesc, typename EmbElementwiseOperation, ck::index_t NumEmbeddings>
__global__ void kernel_sparse_embeddings_forward_layernorm (OutType *p_out, const ck::Array< EmbType *, NumEmbeddings > p_embs, const ck::Array< IndexType *, NumEmbeddings > p_indexes, const GammaDataType *p_gamma, const BetaDataType *p_beta, const OutGridDesc out_grid_desc, const AccDataType epsilon, const EmbElementwiseOperation emb_elementwise_op)
template<typename InputGridDesc, typename InputDataType, typename OutputGridDesc, typename OutputDataType, typename Block2ETileMap, typename ComputePtrOffsetOfStridedBatch, typename GridwiseTensorRearrangeKernel>
__global__ void kernel_tensor_rearrange (const InputGridDesc in_grid_desc, const InputDataType *__restrict__ p_in_global, const OutputGridDesc out_grid_desc, OutputDataType *__restrict__ p_out_global, const index_t batch_count, const Block2ETileMap block_2_tile_map, const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
template<typename GridwiseReduction, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename GridDesc_M_K, typename GridDesc_M>
__global__ void kernel_normalization (const GridDesc_M_K x_grid_desc_m_k, const GridDesc_M_K gamma_grid_desc_m_k, const GridDesc_M_K beta_grid_desc_m_k, const GridDesc_M_K y_grid_desc_m_k, const GridDesc_M save_mean_grid_desc_m, const GridDesc_M save_inv_std_grid_desc_m, index_t num_k_block_tile_iteration, ComputeDataType epsilon, const XDataType *const __restrict__ p_x_global, const GammaDataType *const __restrict__ p_gamma_global, const BetaDataType *const __restrict__ p_beta_global, YDataType *const __restrict__ p_y_global, SaveMeanInvStdDataType *const __restrict__ p_save_mean_global, SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global, const YElementwiseOperation y_elementwise_op)
template<typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename GridDesc_M_K, typename GridDesc_M, index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t XSrcVectorDim, index_t XSrcVectorSize, index_t GammaSrcVectorDim, index_t GammaSrcVectorSize, index_t BetaSrcVectorDim, index_t BetaSrcVectorSize, index_t YDstVectorDim, index_t YDstVectorSize, index_t SaveMeanInvStdDstVectorSize, bool UseWelford>
auto NormalizationKernelSelector (bool isSweepOnce)
template<typename T>
__device__ T * cast_pointer_to_generic_address_space (T CK_CONSTANT_ADDRESS_SPACE *p)
template<typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * cast_pointer_to_constant_address_space (T *p)
template<typename T>
__device__ int32x4_t make_wave_buffer_resource (T *p_wave, index_t element_space_size)
template<typename T>
__device__ int32x4_t make_wave_buffer_resource_with_default_range (T *p_wave)
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
__device__ float llvm_amdgcn_raw_buffer_load_fp32 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4 (int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
__device__ void llvm_amdgcn_raw_buffer_store_i8 (int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
__device__ void llvm_amdgcn_raw_buffer_store_i8x2 (int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
__device__ void llvm_amdgcn_raw_buffer_store_i8x4 (int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
__device__ void llvm_amdgcn_raw_buffer_store_i16 (bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
__device__ void llvm_amdgcn_raw_buffer_store_i16x2 (bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
__device__ void llvm_amdgcn_raw_buffer_store_i16x4 (bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
__device__ void llvm_amdgcn_raw_buffer_store_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
__device__ void llvm_amdgcn_raw_buffer_store_i32x2 (int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
__device__ void llvm_amdgcn_raw_buffer_store_i32x4 (int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
__device__ void llvm_amdgcn_raw_buffer_store_fp16 (half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4 (half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2 (float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4 (float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2 (half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32 (int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32 (float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64 (double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type amd_buffer_load_impl (int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N>
__device__ void amd_global_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, T *addr)
template<typename T, index_t N>
__device__ void amd_buffer_atomic_add_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N>
__device__ void amd_buffer_atomic_max_impl (const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value (const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t N>
__device__ void amd_buffer_atomic_add (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
template<typename T, index_t N>
__device__ void amd_buffer_atomic_max (const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
__device__ void llvm_amdgcn_raw_buffer_load_lds (int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
template<typename T, index_t NumElemsPerThread>
__device__ void amd_direct_load_global_to_lds (const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
template<typename T>
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new (T *p_wave, index_t element_space_size)
template<typename T>
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_with_default_range_new (T *p_wave)
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type amd_buffer_load_impl (__amdgpu_buffer_rsrc_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl_raw (const typename vector_type< int8_t, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void amd_buffer_store_impl (const typename vector_type< T, N >::type src_thread_data, __amdgpu_buffer_rsrc_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
template<>
__host__ __device__ constexpr bool fp8_is_nan (f8_ocp_t a)
template<>
__host__ __device__ constexpr bool fp8_is_nan (bf8_ocp_t a)
template<>
__host__ __device__ constexpr bool fp8_is_nan (f8_fnuz_t a)
template<>
__host__ __device__ constexpr bool fp8_is_nan (bf8_fnuz_t a)
template<>
__host__ __device__ constexpr bool fp8_is_inf (bf8_ocp_t a)
__device__ int amd_assembly_and_b32 (int a, int b)
__device__ int amd_assembly_and_or_b32 (int a, int b, int d)
__device__ half2_t amd_assembly_pk_fma_f16 (half2_t a, half2_t b, half2_t c)
__device__ half2_t amd_assembly_pk_add_f16 (half2_t a, half2_t b)
__device__ float amd_assemble_cvt_f32_i4 (int b)
__device__ f8x4_t amd_assembly_cvt_f8_to_f32 (float b0, float b1, float b2, float b3)
__device__ f8x8_t amd_assembly_i4_to_fp8x8 (int a)
__device__ void amd_assembly_outer_product_1x2 (float a, float b0, float b1, float &c0, float &c1)
__device__ void amd_assembly_outer_product_1x4 (float a, float b0, float b1, float b2, float b3, float &c0, float &c1, float &c2, float &c3)
__device__ void amd_assembly_outer_product_1x2 (half2_t a, half2_t b0, half2_t b1, float &c0, float &c1)
__device__ void amd_assembly_outer_product_1x4 (half2_t a, half2_t b0, half2_t b1, half2_t b2, half2_t b3, float &c0, float &c1, float &c2, float &c3)
__device__ void amd_assembly_outer_product_1x2 (int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t &c0, int32_t &c1)
__device__ void amd_assembly_outer_product_1x4 (int8x4_t a, int8x4_t b0, int8x4_t b1, int8x4_t b2, int8x4_t b3, int32_t &c0, int32_t &c1, int32_t &c2, int32_t &c3)
__device__ uint32_t amd_wave_read_first_lane (uint32_t value)
__device__ int32_t amd_wave_read_first_lane (int32_t value)
__device__ int64_t amd_wave_read_first_lane (int64_t value)
template<typename Object, typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
__device__ auto amd_wave_read_first_lane (const Object &obj)
template<typename X, typename... Xs>
__host__ __device__ constexpr auto make_array (X &&x, Xs &&... xs)
template<typename X>
__host__ __device__ constexpr auto make_array ()
template<typename... Xs>
__host__ __device__ constexpr auto make_multi_index (Xs &&... xs)
template<index_t NSize>
__host__ __device__ constexpr auto make_zero_multi_index ()
template<typename T>
__host__ __device__ constexpr auto to_multi_index (const T &x)
template<index_t NSize, typename X>
__host__ __device__ constexpr auto operator+= (MultiIndex< NSize > &y, const X &x)
template<index_t NSize, typename X>
__host__ __device__ constexpr auto operator-= (MultiIndex< NSize > &y, const X &x)
template<index_t NSize, typename T>
__host__ __device__ constexpr auto operator+ (const MultiIndex< NSize > &a, const T &b)
template<index_t NSize, typename T>
__host__ __device__ constexpr auto operator- (const MultiIndex< NSize > &a, const T &b)
template<index_t NSize, typename T>
__host__ __device__ constexpr auto operator* (const MultiIndex< NSize > &a, const T &b)
template<typename PY, typename PX, typename enable_if< is_pointer_v< PY > && is_pointer_v< PX >, bool >::type = false>
__host__ __device__ PY c_style_pointer_cast (PX p_x)
template<typename Arr, typename Picks, typename X>
__host__ __device__ constexpr auto operator+= (ContainerElementPicker< Arr, Picks > &y, const X &x)
template<typename Arr, typename Picks, typename X>
__host__ __device__ constexpr auto operator-= (ContainerElementPicker< Arr, Picks > &y, const X &x)
template<typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_container_element (Arr &a, Picks)
template<typename Arr, typename Picks>
__host__ __device__ constexpr auto pick_container_element (const Arr &a, Picks)
template<typename TData, index_t NSize>
__host__ __device__ constexpr auto container_push_back (const Array< TData, NSize > &a, const TData &x)
template<typename... Ts, typename T>
__host__ __device__ constexpr auto container_push_front (const Tuple< Ts... > &a, const T &x)
template<typename... Ts, typename T>
__host__ __device__ constexpr auto container_push_back (const Tuple< Ts... > &a, const T &x)
template<typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old (const Array< TData, NSize > &old_array, Sequence< IRs... >)
template<typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new (const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
template<typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old (const Tuple< Ts... > &old_tuple, Sequence< IRs... >)
template<typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new (const Tuple< Ts... > &old_tuple, Sequence< IRs... > old2new)
template<index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old (Sequence< Is... >, Sequence< IRs... >)
template<index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new (Sequence< Is... > old_seq, Sequence< IRs... >)
template<typename Container, typename Reduce, typename Init, index_t IBegin = 0, index_t IEnd = Container::Size(), index_t IStep = 1>
__host__ __device__ constexpr auto container_reduce (const Container &x, Reduce reduce, Init init, Number< IBegin > = Number< 0 >{}, Number< IEnd > = Number< Container::Size()>{}, Number< IStep > = Number< 1 >{})
template<typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto container_reverse_inclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init)
template<typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto container_reverse_exclusive_scan (const Array< TData, NSize > &x, Reduce f, TData init)
template<index_t... Is, typename Reduce, index_t Init>
__host__ __device__ constexpr auto container_reverse_exclusive_scan (const Sequence< Is... > &seq, Reduce f, Number< Init >)
template<typename... Xs, typename Reduce, typename Init>
__host__ __device__ constexpr auto container_reverse_exclusive_scan (const Tuple< Xs... > &x, Reduce reduce, Init init)
template<typename... Xs, typename Reduce, typename TData>
__host__ __device__ constexpr auto container_reverse_inclusive_scan (const Tuple< Xs... > &x, Reduce f, TData init)
template<typename X, typename... Ys>
__host__ __device__ constexpr auto container_concat (const X &x, const Ys &... ys)
template<typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto container_concat (const Array< T, NX > &ax, const Array< T, NY > &ay)
template<typename... X, typename... Y>
__host__ __device__ constexpr auto container_concat (const Tuple< X... > &tx, const Tuple< Y... > &ty)
template<typename Container>
__host__ __device__ constexpr auto container_concat (const Container &x)
template<typename T, index_t N, index_t... Is>
__host__ __device__ constexpr auto get_container_subset (const Array< T, N > &arr, Sequence< Is... >)
template<typename... Ts, index_t... Is>
__host__ __device__ constexpr auto get_container_subset (const Tuple< Ts... > &tup, Sequence< Is... >)
template<typename T, index_t N, index_t... Is>
__host__ __device__ constexpr void set_container_subset (Array< T, N > &y, Sequence< Is... > picks, const Array< T, sizeof...(Is)> &x)
template<typename... Ys, index_t... Is, typename... Xs>
__host__ __device__ constexpr void set_container_subset (Tuple< Ys... > &y, Sequence< Is... > picks, const Tuple< Xs... > &x)
template<index_t... Is>
__host__ __device__ constexpr auto sequence_to_tuple_of_number (Sequence< Is... >)
constexpr auto next_pow2 (uint32_t x)
template<typename T>
constexpr bool is_native_type ()
template<typename T>
const char * get_type_name ()
template<typename T, index_t N>
__host__ __device__ constexpr auto make_vector_type (Number< N >)
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize>
__host__ __device__ constexpr auto make_dynamic_buffer (T *p, ElementSpaceSize element_space_size)
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize>
__host__ __device__ constexpr auto make_long_dynamic_buffer (T *p, ElementSpaceSize element_space_size)
template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize, typename X, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X > >::value, bool >::type = false>
__host__ __device__ constexpr auto make_dynamic_buffer (T *p, ElementSpaceSize element_space_size, X invalid_element_value)
template<class EnvVar>
const std::string & EnvGetString (EnvVar)
template<class EnvVar>
bool EnvIsEnabled (EnvVar)
template<class EnvVar>
bool EnvIsDisabled (EnvVar)
template<class EnvVar>
uint64_t EnvValue (EnvVar)
template<class EnvVar>
bool EnvIsUnset (EnvVar)
template<class EnvVar>
void EnvUnset (EnvVar)
template<typename EnvVar, typename ValueType>
void UpdateEnvVar (EnvVar, const ValueType &val)
 Updates the cached value of an environment variable.
template<typename EnvVar>
void UpdateEnvVar (EnvVar, const std::string_view &val)
__host__ int clz (uint32_t x)
template<bool predicate, typename X, typename Y>
constexpr auto conditional_expr (X &&x, Y &&y)
template<typename F, typename X>
__host__ __device__ constexpr auto unpack (F &&f, X &&x)
template<typename F, typename X, typename Y>
__host__ __device__ constexpr auto unpack2 (F &&f, X &&x, Y &&y)
template<typename X>
__device__ X atomic_add (X *p_dst, const X &x)
template<>
__device__ int32_t atomic_add< int32_t > (int32_t *p_dst, const int32_t &x)
template<>
__device__ uint32_t atomic_add< uint32_t > (uint32_t *p_dst, const uint32_t &x)
template<>
__device__ float atomic_add< float > (float *p_dst, const float &x)
template<>
__device__ unsigned short atomic_add< unsigned short > (unsigned short *p_dst, const unsigned short &x)
template<>
__device__ _Float16 atomic_add< _Float16 > (_Float16 *p_dst, const _Float16 &x)
template<>
__device__ double atomic_add< double > (double *p_dst, const double &x)
template<>
__device__ float2_t atomic_add< float2_t > (float2_t *p_dst, const float2_t &x)
template<>
__device__ double2_t atomic_add< double2_t > (double2_t *p_dst, const double2_t &x)
template<typename X>
__device__ X atomic_max (X *p_dst, const X &x)
template<>
__device__ int32_t atomic_max< int32_t > (int32_t *p_dst, const int32_t &x)
template<>
__device__ uint32_t atomic_max< uint32_t > (uint32_t *p_dst, const uint32_t &x)
template<>
__device__ float atomic_max< float > (float *p_dst, const float &x)
template<>
__device__ double atomic_max< double > (double *p_dst, const double &x)
template<>
__device__ float2_t atomic_max< float2_t > (float2_t *p_dst, const float2_t &x)
__device__ constexpr index_t get_warp_size ()
__device__ index_t get_thread_local_1d_id ()
__device__ index_t get_thread_global_1d_id ()
__device__ index_t get_warp_local_1d_id ()
__device__ index_t get_block_1d_id ()
__device__ index_t get_grid_size ()
__device__ index_t get_block_size ()
template<>
constexpr __device__ index_t get_shift< 1 > ()
template<typename TA, typename TB, typename TC>
__device__ void inner_product (const TA &a, const TB &b, TC &c)
template<>
__device__ void inner_product< float, float, float > (const float &a, const float &b, float &c)
template<>
__device__ void inner_product< float2_t, float2_t, float > (const float2_t &a, const float2_t &b, float &c)
template<>
__device__ void inner_product< float4_t, float4_t, float > (const float4_t &a, const float4_t &b, float &c)
template<>
__device__ void inner_product< bhalf_t, bhalf_t, float > (const bhalf_t &a, const bhalf_t &b, float &c)
template<>
__device__ void inner_product< half_t, half_t, float > (const half_t &a, const half_t &b, float &c)
template<>
__device__ void inner_product< half2_t, half2_t, float > (const half2_t &a, const half2_t &b, float &c)
template<>
__device__ void inner_product< half4_t, half4_t, float > (const half4_t &a, const half4_t &b, float &c)
template<>
__device__ void inner_product< half8_t, half8_t, float > (const half8_t &a, const half8_t &b, float &c)
template<>
__device__ void inner_product< int8_t, int8_t, int32_t > (const int8_t &a, const int8_t &b, int32_t &c)
template<>
__device__ void inner_product< int8x2_t, int8x2_t, int32_t > (const int8x2_t &a, const int8x2_t &b, int32_t &c)
template<>
__device__ void inner_product< int8x4_t, int8x4_t, int32_t > (const int8x4_t &a, const int8x4_t &b, int32_t &c)
template<>
__device__ void inner_product< int8x8_t, int8x8_t, int32_t > (const int8x8_t &a, const int8x8_t &b, int32_t &c)
template<>
__device__ void inner_product< int8x16_t, int8x16_t, int32_t > (const int8x16_t &a, const int8x16_t &b, int32_t &c)
template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto operator+ (integral_constant< TX, X >, integral_constant< TY, Y >)
template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto operator- (integral_constant< TX, X >, integral_constant< TY, Y >)
template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto operator* (integral_constant< TX, X >, integral_constant< TY, Y >)
template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto operator/ (integral_constant< TX, X >, integral_constant< TY, Y >)
template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto operator% (integral_constant< TX, X >, integral_constant< TY, Y >)
constexpr LoopScheduler make_default_loop_scheduler ()
template<typename Y, typename X>
__host__ __device__ constexpr Y mxf8_convert_sr (X x, float scale)
template<typename Y, typename X>
__host__ __device__ constexpr Y mxf8_convert_rne (X x, float scale)
template<>
__host__ __device__ f8_ocp_t mxf8_convert_rne< f8_ocp_t, float > (float x, float scale)
template<>
__host__ __device__ bf8_ocp_t mxf8_convert_rne< bf8_ocp_t, float > (float x, float scale)
template<>
__host__ __device__ f8x2_ocp_t mxf8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x, float scale)
template<>
__host__ __device__ bf8x2_ocp_t mxf8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x, float scale)
template<>
__host__ __device__ f8x16_ocp_t mxf8_convert_rne< f8x16_ocp_t, float16_t > (float16_t x, float scale)
template<>
__host__ __device__ bf8x16_ocp_t mxf8_convert_rne< bf8x16_ocp_t, float16_t > (float16_t x, float scale)
template<>
__host__ __device__ f8x32_ocp_t mxf8_convert_rne< f8x32_ocp_t, float32_t > (float32_t x, float scale)
template<>
__host__ __device__ bf8x32_ocp_t mxf8_convert_rne< bf8x32_ocp_t, float32_t > (float32_t x, float scale)
template<>
__host__ __device__ f8_ocp_t mxf8_convert_sr< f8_ocp_t, float > (float x, float scale)
template<>
__host__ __device__ bf8_ocp_t mxf8_convert_sr< bf8_ocp_t, float > (float x, float scale)
template<>
__host__ __device__ f8x2_ocp_t mxf8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x, float scale)
template<>
__host__ __device__ bf8x2_ocp_t mxf8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x, float scale)
template<>
__host__ __device__ f8x16_ocp_t mxf8_convert_sr< f8x16_ocp_t, float16_t > (float16_t x, float scale)
template<>
__host__ __device__ bf8x16_ocp_t mxf8_convert_sr< bf8x16_ocp_t, float16_t > (float16_t x, float scale)
template<>
__host__ __device__ f8x32_ocp_t mxf8_convert_sr< f8x32_ocp_t, float32_t > (float32_t x, float scale)
template<>
__host__ __device__ bf8x32_ocp_t mxf8_convert_sr< bf8x32_ocp_t, float32_t > (float32_t x, float scale)
template<typename T, uint32_t seed_t, ck::enable_if_t< is_same< float, T >{}, bool > = false>
__host__ __device__ uint32_t prand_generator (index_t id, T val, uint32_t seed=seed_t)
template<typename T, uint32_t seed_t, ck::enable_if_t< is_same< _Float16, T >{}, bool > = false>
__host__ __device__ uint32_t prand_generator (index_t id, T val, uint32_t seed=seed_t)
template<typename T, uint32_t seed_t, ck::enable_if_t<!(is_same< float, T >{}||is_same< _Float16, T >{}), bool > = false>
__host__ __device__ uint32_t prand_generator (int id, T val, uint32_t seed=seed_t)
template<typename Y, typename X>
__host__ constexpr Y scaled_type_convert (e8m0_bexp_t scale, X x)
template<>
__host__ float scaled_type_convert< float, f8_ocp_t > (e8m0_bexp_t scale, f8_ocp_t x)
template<>
__host__ float scaled_type_convert< float, bf8_ocp_t > (e8m0_bexp_t scale, bf8_ocp_t x)
template<>
__host__ float2_t scaled_type_convert< float2_t, f8x2_ocp_t > (e8m0_bexp_t scale, f8x2_ocp_t x)
template<>
__host__ float2_t scaled_type_convert< float2_t, bf8x2_ocp_t > (e8m0_bexp_t scale, bf8x2_ocp_t x)
template<>
__host__ float16_t scaled_type_convert< float16_t, f8x16_ocp_t > (e8m0_bexp_t scale, f8x16_ocp_t x)
template<>
__host__ float16_t scaled_type_convert< float16_t, bf8x16_ocp_t > (e8m0_bexp_t scale, bf8x16_ocp_t x)
template<>
__host__ float32_t scaled_type_convert< float32_t, f8x32_ocp_t > (e8m0_bexp_t scale, f8x32_ocp_t x)
template<>
__host__ float32_t scaled_type_convert< float32_t, bf8x32_ocp_t > (e8m0_bexp_t scale, bf8x32_ocp_t x)
template<>
__host__ f8_ocp_t scaled_type_convert< f8_ocp_t, float > (e8m0_bexp_t scale, float x)
template<>
__host__ bf8_ocp_t scaled_type_convert< bf8_ocp_t, float > (e8m0_bexp_t scale, float x)
template<>
__host__ f8x2_ocp_t scaled_type_convert< f8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x)
template<>
__host__ bf8x2_ocp_t scaled_type_convert< bf8x2_ocp_t, float2_t > (e8m0_bexp_t scale, float2_t x)
template<>
__host__ f8x16_ocp_t scaled_type_convert< f8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x)
template<>
__host__ bf8x16_ocp_t scaled_type_convert< bf8x16_ocp_t, float16_t > (e8m0_bexp_t scale, float16_t x)
template<>
__host__ f8x32_ocp_t scaled_type_convert< f8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x)
template<>
__host__ bf8x32_ocp_t scaled_type_convert< bf8x32_ocp_t, float32_t > (e8m0_bexp_t scale, float32_t x)
template<index_t I, index_t... Is>
__host__ __device__ constexpr auto sequence_pop_front (Sequence< I, Is... >)
template<typename Seq>
__host__ __device__ constexpr auto sequence_pop_back (Seq)
template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr bool operator== (Sequence< Xs... >, Sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator+ (Sequence< Xs... >, Sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator- (Sequence< Xs... >, Sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator* (Sequence< Xs... >, Sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator/ (Sequence< Xs... >, Sequence< Ys... >)
template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator% (Sequence< Xs... >, Sequence< Ys... >)
template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto operator+ (Sequence< Xs... >, Number< Y >)
template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto operator- (Sequence< Xs... >, Number< Y >)
template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto operator* (Sequence< Xs... >, Number< Y >)
template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto operator/ (Sequence< Xs... >, Number< Y >)
template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto operator% (Sequence< Xs... >, Number< Y >)
template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto operator+ (Number< Y >, Sequence< Xs... >)
template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto operator- (Number< Y >, Sequence< Xs... >)
template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto operator* (Number< Y >, Sequence< Xs... >)
template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto operator/ (Number< Y >, Sequence< Xs... >)
template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto operator% (Number< Y >, Sequence< Xs... >)
template<typename... Seqs>
__host__ __device__ constexpr auto merge_sequences (Seqs...)
template<typename F, index_t... Xs>
__host__ __device__ constexpr auto transform_sequences (F f, Sequence< Xs... >)
template<typename F, index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >)
template<typename F, index_t... Xs, index_t... Ys, index_t... Zs>
__host__ __device__ constexpr auto transform_sequences (F f, Sequence< Xs... >, Sequence< Ys... >, Sequence< Zs... >)
template<typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto reverse_inclusive_scan_sequence (Seq, Reduce, Number< Init >)
template<typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto reverse_exclusive_scan_sequence (Seq, Reduce, Number< Init >)
template<typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto inclusive_scan_sequence (Seq, Reduce, Number< Init >)
template<typename Seq, index_t... Is>
__host__ __device__ constexpr auto pick_sequence_elements_by_ids (Seq, Sequence< Is... >)
template<typename Seq, typename Mask>
__host__ __device__ constexpr auto pick_sequence_elements_by_mask (Seq, Mask)
template<typename Seq, typename Values, typename Ids>
__host__ __device__ constexpr auto modify_sequence_elements_by_ids (Seq, Values, Ids)
template<typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr index_t reduce_on_sequence (Seq, Reduce f, Number< Init >)
template<typename Seq, typename F>
__host__ __device__ constexpr bool sequence_any_of (Seq, F f)
template<typename Seq, typename F>
__host__ __device__ constexpr bool sequence_all_of (Seq, F f)
template<index_t... Is>
__host__ __device__ constexpr auto make_sequence (Number< Is >...)
template<typename F, index_t N>
__host__ __device__ constexpr auto generate_sequence (F, Number< N >)
template<typename F, index_t N>
__host__ __device__ constexpr auto generate_sequence_v2 (F &&f, Number< N >)
template<index_t... Is>
__host__ __device__ constexpr auto to_sequence (Tuple< Number< Is >... >)
template<AddressSpaceEnum AddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto make_static_buffer (Number< N >)
template<AddressSpaceEnum AddressSpace, typename T, long_index_t N>
__host__ __device__ constexpr auto make_static_buffer (LongNumber< N >)
template<typename X, typename... Xs>
__host__ __device__ constexpr auto make_statically_indexed_array (const X &x, const Xs &... xs)
template<typename X>
__host__ __device__ constexpr auto make_statically_indexed_array ()
template<typename... Ys, typename X, enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ __device__ constexpr auto operator+= (Tuple< Ys... > &y, const X &x)
template<typename... Ys, typename X, enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ __device__ constexpr auto operator-= (Tuple< Ys... > &y, const X &x)
template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto operator+ (const Tuple< Xs... > &x, const Y &y)
template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto operator- (const Tuple< Xs... > &x, const Y &y)
template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto operator* (const Tuple< Xs... > &x, const Y &y)
template<typename... Xs, typename Y, enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto operator* (Y a, const Tuple< Xs... > &x)
template<typename... Xs, typename Y, enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto operator* (const Tuple< Xs... > &x, Y a)
template<typename... Xs>
__host__ __device__ void print_multi_index (const Tuple< Xs... > &x)
__device__ void block_sync_lds ()
__device__ void block_sync_lds_direct_load ()
__device__ void s_nop ()
__device__ void transpose_fp16_2x2 (const half2_t &x0, const half2_t &x1, half2_t &y0, half2_t &y1)
__device__ void transpose_int8_4x4 (const int8x4_t &x0, const int8x4_t &x1, const int8x4_t &x2, const int8x4_t &x3, int8x4_t &y0, int8x4_t &y1, int8x4_t &y2, int8x4_t &y3)
__device__ void transpose_f8_4x4 (const f8x4_t &x0, const f8x4_t &x1, const f8x4_t &x2, const f8x4_t &x3, f8x4_t &y0, f8x4_t &y1, f8x4_t &y2, f8x4_t &y3)
template<typename... Xs>
__host__ __device__ constexpr auto make_tuple (Xs &&... xs)
template<typename... Args>
constexpr Tuple< Args &... > tie (Args &... args) noexcept
template<typename F, index_t... ids>
__host__ __device__ constexpr auto generate_tuple_for (F &&f, Sequence< ids... >)
template<typename F, index_t N>
__host__ __device__ constexpr auto generate_tuple (F &&f, Number< N >)
template<typename F, index_t N>
__host__ __device__ constexpr auto generate_tuple (F &&f, LongNumber< N >)
template<typename F, index_t N>
__host__ __device__ constexpr auto generate_tie (F &&f, Number< N >)
template<typename... X, typename... Y>
__host__ __device__ constexpr auto concat_tuple_of_reference (const Tuple< X &... > &tx, const Tuple< Y &... > &ty)
template<typename... X, typename... Y>
__host__ __device__ constexpr auto concat_tuple (const Tuple< X... > &tx, const Tuple< Y... > &ty)
template<typename... X>
__host__ __device__ constexpr auto concat_tuple (const Tuple< X... > &tx)
template<typename... X, typename... Tuples>
__host__ __device__ constexpr auto concat_tuple (const Tuple< X... > &tx, const Tuples &... tuples)
template<typename F, typename X>
__host__ __device__ constexpr auto transform_tuples (F f, const X &x)
template<typename F, typename X, typename Y>
__host__ __device__ constexpr auto transform_tuples (F f, const X &x, const Y &y)
template<typename F, typename X, typename Y, typename Z>
__host__ __device__ constexpr auto transform_tuples (F f, const X &x, const Y &y, const Z &z)
template<index_t Depth = 0, index_t MaxDepth = -1>
__host__ __device__ constexpr auto UnrollNestedTuple (const Tuple<> &element)
template<index_t Depth = 0, index_t MaxDepth = -1, typename T>
__host__ __device__ constexpr auto UnrollNestedTuple (const T &element)
template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
__host__ __device__ constexpr auto UnrollNestedTuple (const Tuple< Ts... > &tuple)
template<typename... Ts>
__host__ __device__ constexpr auto TupleReverse (const Tuple< Ts... > &tuple)
template<index_t Idx, index_t End, typename F, typename... Ts>
__host__ __device__ constexpr auto TupleReduce (F &&f, const Tuple< Ts... > &tuple)
template<typename... Ts>
__host__ __device__ constexpr auto IsNestedTuple (const Tuple< Ts... > &)
template<index_t depth = 0, typename T>
__host__ __device__ constexpr auto TupleDepth (const T &)
template<index_t depth = 0, typename... Ts>
__host__ __device__ constexpr auto TupleDepth (const Tuple< Ts... > &)
template<index_t from, index_t to, typename... Ts>
__host__ __device__ constexpr auto TupleSlice (const Tuple< Ts... > &tuple)
template<typename Y, typename X, typename enable_if< sizeof(X)==sizeof(Y), bool >::type = false>
__host__ __device__ constexpr Y bit_cast (const X &x)
template<typename Y, typename X>
__host__ __device__ constexpr Y bf16_convert_rtn (X x)
template<>
__host__ __device__ constexpr bhalf_t bf16_convert_rtn< bhalf_t, float > (float x)
template<>
__host__ __device__ constexpr bhalf_t bf16_convert_rtn< bhalf_t, half_t > (half_t x)
template<typename Y, typename X, ck::enable_if_t<!(ck::is_const_v< Y >||ck::is_const_v< X >), bool > = false>
__host__ __device__ constexpr Y type_convert (X x)
template<typename Y, typename X, ck::enable_if_t< ck::is_const_v< Y >||ck::is_const_v< X >, bool > = false>
__host__ __device__ constexpr Y type_convert (X x)
template<>
__host__ __device__ constexpr float type_convert< float, bhalf_t > (bhalf_t x)
template<>
__host__ __device__ constexpr bhalf_t type_convert< bhalf_t, float > (float x)
template<>
__host__ __device__ constexpr half_t type_convert< half_t, bhalf_t > (bhalf_t x)
template<>
__host__ __device__ constexpr bhalf_t type_convert< bhalf_t, half_t > (half_t x)
template<>
__host__ __device__ constexpr int8_t type_convert< int8_t, bhalf_t > (bhalf_t x)
template<>
__host__ __device__ constexpr bhalf_t type_convert< bhalf_t, int8_t > (int8_t x)
template<>
__host__ __device__ constexpr f8_ocp_t type_convert< f8_ocp_t, int > (int x)
template<>
__host__ __device__ constexpr bf8_ocp_t type_convert< bf8_ocp_t, int > (int x)
template<typename Y, enable_if_t< is_same_v< Y, ck::tf32_t >, bool > = false>
__host__ __device__ constexpr float type_convert (float x)
template<typename Y, typename X>
__host__ __device__ constexpr Y type_convert_sp (X x)
template<>
__host__ __device__ constexpr int type_convert_sp< int, float > (float x)
template<>
__host__ __device__ constexpr float type_convert_sp< float, int > (int x)
template<>
__host__ __device__ constexpr int type_convert_sp< int, half_t > (half_t x)
template<>
__host__ __device__ constexpr half_t type_convert_sp< half_t, int > (int x)
template<>
__host__ __device__ constexpr int type_convert_sp< int, f8_t > (f8_t x)
template<>
__host__ __device__ constexpr f8_t type_convert_sp< f8_t, int > (int x)
template<>
__host__ __device__ constexpr int type_convert_sp< int, bhalf_t > (bhalf_t x)
template<>
__host__ __device__ constexpr bhalf_t type_convert_sp< bhalf_t, int > (int x)
template<>
__host__ __device__ constexpr bhalf_t type_convert_sp< bhalf_t, float > (float x)
template<>
__host__ __device__ constexpr half_t type_convert_sp< half_t, float > (float x)
template<typename Y, typename X>
__host__ __device__ constexpr Y f8_convert_sr (X x)
template<>
__host__ __device__ f8_fnuz_t f8_convert_sr< f8_fnuz_t, float > (float x)
template<>
__host__ __device__ f8_fnuz_t f8_convert_sr< f8_fnuz_t, half_t > (half_t x)
template<>
__host__ __device__ bf8_fnuz_t f8_convert_sr< bf8_fnuz_t, float > (float x)
template<>
__host__ __device__ bf8_fnuz_t f8_convert_sr< bf8_fnuz_t, half_t > (half_t x)
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (f8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (bf8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (f8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ f8_ocp_t f8_convert_sr< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ f8x2_ocp_t f8_convert_sr< f8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ bf8_ocp_t f8_convert_sr< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding.
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_sr< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.
template<typename Y, typename X>
__host__ __device__ constexpr Y f8_convert_rne (X x)
template<>
__host__ __device__ f8_fnuz_t f8_convert_rne< f8_fnuz_t, float > (float x)
template<>
__host__ __device__ f8_fnuz_t f8_convert_rne< f8_fnuz_t, half_t > (half_t x)
template<>
__host__ __device__ bf8_fnuz_t f8_convert_rne< bf8_fnuz_t, float > (float x)
template<>
__host__ __device__ bf8_fnuz_t f8_convert_rne< bf8_fnuz_t, half_t > (half_t x)
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (f8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, float > (float x)
 Converts a float to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, float2_t > (float2_t x)
 Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (f8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, half2_t > (half2_t x)
 Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ f8_ocp_t f8_convert_rne< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ f8x2_ocp_t f8_convert_rne< f8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ bf8_ocp_t f8_convert_rne< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ bf8x2_ocp_t f8_convert_rne< bf8x2_ocp_t, bhalf2_t > (bhalf2_t x)
 Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.
template<>
__host__ __device__ f8_fnuz_t type_convert< f8_fnuz_t, float > (float x)
template<>
__host__ __device__ float type_convert< float, f8_fnuz_t > (f8_fnuz_t x)
template<>
__host__ __device__ float2_t type_convert< float2_t, f8x2_fnuz_t > (f8x2_fnuz_t x)
template<>
__host__ __device__ float type_convert< float, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a float value.
template<>
__host__ __device__ float2_t type_convert< float2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 float values.
template<>
__host__ __device__ half_t type_convert< half_t, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a half_t value.
template<>
__host__ __device__ half2_t type_convert< half2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values.
template<>
__host__ __device__ bhalf_t type_convert< bhalf_t, f8_ocp_t > (f8_ocp_t x)
 Converts a f8_ocp_t value to a bhalf_t value.
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, f8x2_ocp_t > (f8x2_ocp_t x)
 Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values.
template<>
__host__ __device__ float type_convert< float, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a float value.
template<>
__host__ __device__ float2_t type_convert< float2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values.
template<>
__host__ __device__ half_t type_convert< half_t, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a half_t value.
template<>
__host__ __device__ half2_t type_convert< half2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values.
template<>
__host__ __device__ bhalf_t type_convert< bhalf_t, bf8_ocp_t > (bf8_ocp_t x)
 Converts a bf8_ocp_t value to a bhalf_t value.
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, bf8x2_ocp_t > (bf8x2_ocp_t x)
 Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values.
template<>
__host__ __device__ float2_t type_convert< float2_t, pk_i4_t > (pk_i4_t x)
template<>
__host__ __device__ half2_t type_convert< half2_t, pk_i4_t > (pk_i4_t x)
template<>
__host__ __device__ bhalf2_t type_convert< bhalf2_t, pk_i4_t > (pk_i4_t x)
template<>
__host__ __device__ half2_t type_convert< half2_t, float2_t > (float2_t x)
template<>
__host__ __device__ f8_fnuz_t type_convert< f8_fnuz_t, half_t > (half_t x)
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, half_t > (half_t x)
 Converts a half_t value to a f8_ocp_t value with rounding determined by a flag.
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, half_t > (half_t x)
 Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag.
template<>
__host__ __device__ half_t type_convert< half_t, f8_fnuz_t > (f8_fnuz_t x)
template<>
__host__ __device__ bf8_fnuz_t type_convert< bf8_fnuz_t, float > (float x)
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, float > (float x)
 Converts a float value to a f8_ocp_t value with rounding determined by a flag.
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, float > (float x)
 Converts a float value to a bf8_ocp_t value with rounding determined by a flag.
template<>
__host__ __device__ f8_ocp_t type_convert< f8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t value to a f8_ocp_t value with rounding determined by a flag.
template<>
__host__ __device__ bf8_ocp_t type_convert< bf8_ocp_t, bhalf_t > (bhalf_t x)
 Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag.
template<>
__host__ __device__ float type_convert< float, bf8_fnuz_t > (bf8_fnuz_t x)
template<>
__host__ __device__ bf8_fnuz_t type_convert< bf8_fnuz_t, half_t > (half_t x)
template<>
__host__ __device__ half_t type_convert< half_t, bf8_fnuz_t > (bf8_fnuz_t x)
__host__ __device__ f4_t f4_convert_rne (float x, float scale=1.0f)
__host__ __device__ f4x2_t f4_convert_rne (float2_t x, float scale=1.0f)
__host__ __device__ f4_t f4_convert_sr (float x, float scale=1.0f)
__host__ __device__ f4x2_t f4_convert_sr (float2_t x, float scale=1.0f)
template<>
__host__ __device__ f4_t type_convert< f4_t, float > (float x)
template<>
__host__ __device__ f4x2_t type_convert< f4x2_t, float2_t > (float2_t x)
template<>
__host__ __device__ f4x2_pk_t type_convert< f4x2_pk_t, float2_t > (float2_t x)
template<>
__host__ __device__ f4x32_t type_convert< f4x32_t, float32_t > (float32_t x)
template<>
__host__ __device__ float type_convert< float, f4_t > (f4_t x)
template<>
__host__ __device__ float2_t type_convert< float2_t, f4x2_t > (f4x2_t x)
template<>
__host__ __device__ float32_t type_convert< float32_t, f4x32_t > (f4x32_t x)
__host__ __device__ f6_t f6_convert_rne (float x, float scale=1.0f)
 Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even.
__host__ __device__ f6x32_t f6_convert_rne (float32_t x, float scale=1.0f)
 Converts a 32-element single-precision float array into a packed 6-bit representation (f6x32_t) using round-to-nearest-even.
__host__ __device__ f6_t f6_convert_sr (float x, float scale=1.0f)
 Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding.
__host__ __device__ f6x32_t f6_convert_sr (float32_t x, float scale=1.0f)
 Converts a 32-element single-precision float array into a packed 6-bit representation (f6x32_t) using stochastic rounding.
template<>
__host__ __device__ f6_t type_convert< f6_t, float > (float x)
 Specializes the type conversion template for converting a float into the 6-bit float type (f6_t).
template<>
__host__ __device__ f6x32_t type_convert< f6x32_t, float32_t > (float32_t x)
 Specializes the type conversion template for converting a vector of 32 floats into the vector of 32 6-bit float types (f6x32_t).
template<>
__host__ __device__ f6x32_pk_t type_convert< f6x32_pk_t, float32_t > (float32_t x)
template<>
__host__ __device__ f6x16_t type_convert< f6x16_t, float16_t > (float16_t x)
template<>
__host__ __device__ f6x16_pk_t type_convert< f6x16_pk_t, float16_t > (float16_t x)
template<>
__host__ __device__ float type_convert< float, f6_t > (f6_t x)
 Specializes the type conversion template for converting the 6-bit float type (f6_t) to float.
template<>
__host__ __device__ float32_t type_convert< float32_t, f6x32_t > (f6x32_t x)
 Specializes the type conversion template for converting a vector of 32 6-bit float types (f6x32_t) to a vector of 32 floats.
template<>
__host__ __device__ float16_t type_convert< float16_t, f6x16_t > (f6x16_t x)
template<>
__host__ __device__ float16_t type_convert< float16_t, f6x16_pk_t > (f6x16_pk_t x)
__host__ __device__ bf6_t bf6_convert_rne (float x, float scale=1.0f)
 Converts a float to the 6-bit BF6 type using round-to-nearest-even.
__host__ __device__ bf6x32_t bf6_convert_rne (float32_t x, float scale=1.0f)
 Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using round-to-nearest-even.
__host__ __device__ bf6_t bf6_convert_sr (float x, float scale=1.0f)
 Converts a float to the 6-bit BF6 type using stochastic rounding.
__host__ __device__ bf6x32_t bf6_convert_sr (float32_t x, float scale=1.0f)
 Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using stochastic rounding.
template<>
__host__ __device__ bf6_t type_convert< bf6_t, float > (float x)
 Specializes float-to-bf6_t conversion.
template<>
__host__ __device__ bf6x32_t type_convert< bf6x32_t, float32_t > (float32_t x)
 Specializes vector of 32 float-to-bf6_t conversion.
template<>
__host__ __device__ bf6x32_pk_t type_convert< bf6x32_pk_t, float32_t > (float32_t x)
template<>
__host__ __device__ bf6x16_t type_convert< bf6x16_t, float16_t > (float16_t x)
template<>
__host__ __device__ bf6x16_pk_t type_convert< bf6x16_pk_t, float16_t > (float16_t x)
template<>
__host__ __device__ float type_convert< float, bf6_t > (bf6_t x)
 Specializes the type conversion template for converting a bf6_t value to float.
template<>
__host__ __device__ float32_t type_convert< float32_t, bf6x32_t > (bf6x32_t x)
 Specializes the type conversion template for converting a vector of 32 bf6_t values to a vector of 32 floats.
template<>
__host__ __device__ float16_t type_convert< float16_t, bf6x16_t > (bf6x16_t x)
template<>
__host__ __device__ float16_t type_convert< float16_t, bf6x16_pk_t > (bf6x16_pk_t x)
template<typename Y, typename X, size_t NumElems>
__host__ __device__ void array_convert (std::array< Y, NumElems > &y, const std::array< X, NumElems > &x)
template<typename Y, typename X, index_t NumElems>
__host__ __device__ void array_convert (Array< Y, NumElems > &y, const Array< X, NumElems > &x)

Variables

template<typename T>
constexpr index_t packed_size_v = packed_type_info<T>::packed_size
template<typename T>
constexpr bool is_packed_type_v = packed_size_v<T> > 1
constexpr detail::ignore_t ignore
template<typename X, typename Y>
constexpr bool is_same_v = is_same<X, Y>::value
template<typename X, typename Y>
constexpr bool is_base_of_v = is_base_of<X, Y>::value
template<typename T>
constexpr bool is_unsigned_v = is_unsigned<T>::value
template<typename T>
constexpr bool is_pointer_v = is_pointer<T>::value

Detailed Description

Definitions from <cstdint>, <cmath> conflict with /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h.

Typedef Documentation

◆ bf6_t

using ck::bf6_t = unsigned _BitInt(6)

◆ bf6x16_pk_t

◆ bf6x16_t

using ck::bf6x16_t = typename vector_type<bf6x16_pk_t, 1>::type

◆ bf6x16x2_t

using ck::bf6x16x2_t = typename vector_type<bf6x16_pk_t, 2>::type

◆ bf6x32_pk_t

◆ bf6x32_t

using ck::bf6x32_t = typename vector_type<bf6x32_pk_t, 1>::type

◆ bf8_t

◆ bf8x16_fnuz_t

using ck::bf8x16_fnuz_t = typename vector_type<bf8_fnuz_t, 16>::type

◆ bf8x16_ocp_t

using ck::bf8x16_ocp_t = typename vector_type<bf8_ocp_t, 16>::type

◆ bf8x2_fnuz_t

using ck::bf8x2_fnuz_t = typename vector_type<bf8_fnuz_t, 2>::type

◆ bf8x2_ocp_t

using ck::bf8x2_ocp_t = typename vector_type<bf8_ocp_t, 2>::type

◆ bf8x32_fnuz_t

using ck::bf8x32_fnuz_t = typename vector_type<bf8_fnuz_t, 32>::type

◆ bf8x32_ocp_t

using ck::bf8x32_ocp_t = typename vector_type<bf8_ocp_t, 32>::type

◆ bf8x4_fnuz_t

using ck::bf8x4_fnuz_t = typename vector_type<bf8_fnuz_t, 4>::type

◆ bf8x4_ocp_t

using ck::bf8x4_ocp_t = typename vector_type<bf8_ocp_t, 4>::type

◆ bf8x64_fnuz_t

using ck::bf8x64_fnuz_t = typename vector_type<bf8_fnuz_t, 64>::type

◆ bf8x64_ocp_t

using ck::bf8x64_ocp_t = typename vector_type<bf8_ocp_t, 64>::type

◆ bf8x8_fnuz_t

using ck::bf8x8_fnuz_t = typename vector_type<bf8_fnuz_t, 8>::type

◆ bf8x8_ocp_t

using ck::bf8x8_ocp_t = typename vector_type<bf8_ocp_t, 8>::type

◆ bhalf16_t

using ck::bhalf16_t = typename vector_type<bhalf_t, 16>::type

◆ bhalf2_t

using ck::bhalf2_t = typename vector_type<bhalf_t, 2>::type

◆ bhalf32_t

using ck::bhalf32_t = typename vector_type<bhalf_t, 32>::type

◆ bhalf4_t

using ck::bhalf4_t = typename vector_type<bhalf_t, 4>::type

◆ bhalf8_t

using ck::bhalf8_t = typename vector_type<bhalf_t, 8>::type

◆ bhalf_t

using ck::bhalf_t = ushort

◆ bool_constant

template<bool B>
using ck::bool_constant = integral_constant<bool, B>

◆ conditional_t

template<bool predicate, class X, class Y>
using ck::conditional_t = typename conditional<predicate, X, Y>::type

◆ double2_t

using ck::double2_t = typename vector_type<double, 2>::type

◆ double4_t

using ck::double4_t = typename vector_type<double, 4>::type

◆ e8m0x4_bexp_t

using ck::e8m0x4_bexp_t = typename vector_type<e8m0_bexp_t, 4>::type

◆ element_type_t

template<typename T>
using ck::element_type_t = typename packed_type_info<T>::element_type

◆ enable_if

template<bool B, typename T = void>
using ck::enable_if = std::enable_if<B, T>

◆ enable_if_t

template<bool B, typename T = void>
using ck::enable_if_t = typename std::enable_if<B, T>::type

◆ f4_t

using ck::f4_t = unsigned _BitInt(4)

◆ f4x16_t

using ck::f4x16_t = typename vector_type<f4x2_pk_t, 8>::type

◆ f4x2_t

using ck::f4x2_t = typename vector_type<f4x2_pk_t, 1>::type

◆ f4x32_t

using ck::f4x32_t = typename vector_type<f4x2_pk_t, 16>::type

◆ f4x4_t

using ck::f4x4_t = typename vector_type<f4x2_pk_t, 2>::type

◆ f4x64_t

using ck::f4x64_t = typename vector_type<f4x2_pk_t, 32>::type

◆ f4x8_t

using ck::f4x8_t = typename vector_type<f4x2_pk_t, 4>::type

◆ f6_t

using ck::f6_t = _BitInt(6)

◆ f6x16_pk_t

using ck::f6x16_pk_t = f6_pk_t<f6_t, 16>

◆ f6x16_t

using ck::f6x16_t = typename vector_type<f6x16_pk_t, 1>::type

◆ f6x16x2_t

using ck::f6x16x2_t = typename vector_type<f6x16_pk_t, 2>::type

◆ f6x32_pk_t

using ck::f6x32_pk_t = f6_pk_t<f6_t, 32>

◆ f6x32_t

using ck::f6x32_t = typename vector_type<f6x32_pk_t, 1>::type

◆ f8_t

◆ f8x16_fnuz_t

using ck::f8x16_fnuz_t = typename vector_type<f8_fnuz_t, 16>::type

◆ f8x16_ocp_t

using ck::f8x16_ocp_t = typename vector_type<f8_ocp_t, 16>::type

◆ f8x2_fnuz_t

using ck::f8x2_fnuz_t = typename vector_type<f8_fnuz_t, 2>::type

◆ f8x2_ocp_t

using ck::f8x2_ocp_t = typename vector_type<f8_ocp_t, 2>::type

◆ f8x32_fnuz_t

using ck::f8x32_fnuz_t = typename vector_type<f8_fnuz_t, 32>::type

◆ f8x32_ocp_t

using ck::f8x32_ocp_t = typename vector_type<f8_ocp_t, 32>::type

◆ f8x4_fnuz_t

using ck::f8x4_fnuz_t = typename vector_type<f8_fnuz_t, 4>::type

◆ f8x4_ocp_t

using ck::f8x4_ocp_t = typename vector_type<f8_ocp_t, 4>::type

◆ f8x64_fnuz_t

using ck::f8x64_fnuz_t = typename vector_type<f8_fnuz_t, 64>::type

◆ f8x64_ocp_t

using ck::f8x64_ocp_t = typename vector_type<f8_ocp_t, 64>::type

◆ f8x8_fnuz_t

using ck::f8x8_fnuz_t = typename vector_type<f8_fnuz_t, 8>::type

◆ f8x8_ocp_t

using ck::f8x8_ocp_t = typename vector_type<f8_ocp_t, 8>::type

◆ false_type

using ck::false_type = bool_constant<false>

◆ float16_t

using ck::float16_t = typename vector_type<float, 16>::type

◆ float2_t

using ck::float2_t = typename vector_type<float, 2>::type

◆ float32_t

using ck::float32_t = typename vector_type<float, 32>::type

◆ float4_t

using ck::float4_t = typename vector_type<float, 4>::type

◆ float64_t

using ck::float64_t = typename vector_type<float, 64>::type

◆ float8_t

using ck::float8_t = typename vector_type<float, 8>::type

◆ fp8_storage_t

typedef unsigned char ck::fp8_storage_t

◆ half16_t

using ck::half16_t = typename vector_type<half_t, 16>::type

◆ half2_t

using ck::half2_t = typename vector_type<half_t, 2>::type

◆ half32_t

using ck::half32_t = typename vector_type<half_t, 32>::type

◆ half4_t

using ck::half4_t = typename vector_type<half_t, 4>::type

◆ half8_t

using ck::half8_t = typename vector_type<half_t, 8>::type

◆ half_t

using ck::half_t = _Float16

◆ has_same_scalar_type

template<typename X, typename Y>
using ck::has_same_scalar_type
Initial value:
Definition type.hpp:177
Definition data_type.hpp:39

◆ index_t

◆ int32x16_t

using ck::int32x16_t = typename vector_type<int32_t, 16>::type

◆ int32x2_t

using ck::int32x2_t = typename vector_type<int32_t, 2>::type

◆ int32x32_t

using ck::int32x32_t = typename vector_type<int32_t, 32>::type

◆ int32x4_t

using ck::int32x4_t = typename vector_type<int32_t, 4>::type

◆ int32x64_t

using ck::int32x64_t = typename vector_type<int32_t, 64>::type

◆ int32x6_t

using ck::int32x6_t = typename vector_type<int32_t, 6>::type

◆ int32x8_t

using ck::int32x8_t = typename vector_type<int32_t, 8>::type

◆ int4_t

using ck::int4_t = _BitInt(4)

◆ int64_t

using ck::int64_t = long

◆ int8x16_t

using ck::int8x16_t = typename vector_type<int8_t, 16>::type

◆ int8x2_t

using ck::int8x2_t = typename vector_type<int8_t, 2>::type

◆ int8x32_t

using ck::int8x32_t = typename vector_type<int8_t, 32>::type

◆ int8x4_t

using ck::int8x4_t = typename vector_type<int8_t, 4>::type

◆ int8x64_t

using ck::int8x64_t = typename vector_type<int8_t, 64>::type

◆ int8x8_t

using ck::int8x8_t = typename vector_type<int8_t, 8>::type

◆ is_detected

template<template< class... > class Op, class... Args>
using ck::is_detected = typename detail::detector<nonesuch, void, Op, Args...>::value_t

◆ is_pack2_invocable_t

template<typename T>
using ck::is_pack2_invocable_t = decltype(ck::declval<T&>().is_pack2_invocable)

◆ is_pack4_invocable_t

template<typename T>
using ck::is_pack4_invocable_t = decltype(ck::declval<T&>().is_pack4_invocable)

◆ is_pack8_invocable_t

template<typename T>
using ck::is_pack8_invocable_t = decltype(ck::declval<T&>().is_pack8_invocable)

◆ is_tuple

template<typename T>
using ck::is_tuple = decltype(ck::declval<T&>().IsTuple())

◆ iter_difference_t

template<typename T>
using ck::iter_difference_t = typename std::iterator_traits<remove_cvref_t<T>>::difference_type

◆ iter_reference_t

template<typename T>
using ck::iter_reference_t = decltype(*std::declval<T&>())

◆ iter_value_t

template<typename T>
using ck::iter_value_t = typename std::iterator_traits<remove_cvref_t<T>>::value_type

◆ long_index_t

◆ LongNumber

◆ make_index_sequence

template<index_t N>
using ck::make_index_sequence
Initial value:
typename __make_integer_seq<impl::__integer_sequence, index_t, N>::seq_type

◆ MultiIndex

template<index_t N>
using ck::MultiIndex = Array<index_t, N>

◆ Number

template<index_t N>
using ck::Number = integral_constant<index_t, N>

◆ packed_type_t

template<typename T, index_t N = 0>
using ck::packed_type_t = typename packed_type_maker<T, N>::packed_type

◆ pk_i4x2_t

using ck::pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type

◆ pk_i4x4_t

using ck::pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type

◆ pk_i4x8_t

using ck::pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type

◆ remove_cv_t

template<typename T>
using ck::remove_cv_t = typename remove_cv<T>::type

◆ remove_cvref_t

template<typename T>
using ck::remove_cvref_t = remove_cv_t<remove_reference_t<T>>

◆ remove_pointer_t

template<typename T>
using ck::remove_pointer_t = typename remove_pointer<T>::type

◆ remove_reference_t

template<typename T>
using ck::remove_reference_t = typename remove_reference<T>::type

◆ sequence_merge_t

template<typename Sx, typename Sy>
using ck::sequence_merge_t = typename sequence_merge<Sx, Sy>::type

◆ StaticallyIndexedArray

template<typename T, index_t N>
using ck::StaticallyIndexedArray = typename detail::StaticallyIndexedArrayImpl<T, N>::type

◆ TensorCoordinate_t

template<typename TensorDesc>
using ck::TensorCoordinate_t
Initial value:
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12

◆ TensorCoordinateStep_t

template<typename TensorDesc>
using ck::TensorCoordinateStep_t
Initial value:
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444

◆ tf32_t

using ck::tf32_t = _BitInt(19)

◆ true_type

◆ tuple_element_t

template<index_t I, typename TTuple>
using ck::tuple_element_t = typename tuple_element<I, TTuple>::type

◆ uint8x16_t

using ck::uint8x16_t = typename vector_type<uint8_t, 16>::type

◆ uint8x2_t

using ck::uint8x2_t = typename vector_type<uint8_t, 2>::type

◆ uint8x32_t

using ck::uint8x32_t = typename vector_type<uint8_t, 32>::type

◆ uint8x4_t

using ck::uint8x4_t = typename vector_type<uint8_t, 4>::type

◆ uint8x64_t

using ck::uint8x64_t = typename vector_type<uint8_t, 64>::type

◆ uint8x8_t

using ck::uint8x8_t = typename vector_type<uint8_t, 8>::type

◆ uniform_sequence_gen_t

template<index_t NSize, index_t I>
using ck::uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type

◆ vector_type_maker_t

template<typename T, index_t N>
using ck::vector_type_maker_t = typename vector_type_maker<T, N>::type

Enumeration Type Documentation

◆ Activation [1/5]

Enumerator
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 

◆ Activation [2/5]

Enumerator
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 

◆ Activation [3/5]

Enumerator
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 

◆ Activation [4/5]

Enumerator
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 

◆ Activation [5/5]

Enumerator
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 
gelu_and_mul 
silu_and_mul 

◆ AddressSpaceEnum

enum struct ck::AddressSpaceEnum
strong
Enumerator
Generic 
Global 
Lds 
Sgpr 
Vgpr 

◆ AmdBufferCoherenceEnum [1/2]

enum struct ck::AmdBufferCoherenceEnum
strong
Enumerator
DefaultCoherence 
GLC 
SLC 
GLC_SLC 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 
DefaultCoherence 
GLC 
SLC 
GLC_SLC 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 

◆ AmdBufferCoherenceEnum [2/2]

enum struct ck::AmdBufferCoherenceEnum
strong
Enumerator
DefaultCoherence 
GLC 
SLC 
GLC_SLC 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 
DefaultCoherence 
GLC 
SLC 
GLC_SLC 
WAVE_NT0 
WAVE_NT1 
GROUP_NT0 
GROUP_NT1 
DEVICE_NT0 
DEVICE_NT1 
SYSTEM_NT0 
SYSTEM_NT1 

◆ BlockGemmPipelineScheduler

enum struct ck::BlockGemmPipelineScheduler
strong
Enumerator
Intrawave 
Interwave 

◆ BlockGemmPipelineVersion

enum struct ck::BlockGemmPipelineVersion
strong
Enumerator
v1 
v2 
v3 
v4 
v5 

◆ ck_fp8_interpretation_t

enum class ck::ck_fp8_interpretation_t
strong

Describes FP8 interpretation.

Enumerator
CK_E4M3_OCP 
CK_E5M2_OCP 
CK_E4M3_FNUZ 
CK_E5M2_FNUZ 

◆ ck_saturation_t

enum class ck::ck_saturation_t
strong

Describes saturation behavior.

Enumerator
CK_NOSAT 
CK_SATFINITE 

◆ DppInstr

enum struct ck::DppInstr
strong
Enumerator
dpp8_f16_1x32x2 
dpp8_f16_2x16x2 
dpp8_f16_2x32x2 
dpp8_f16_4x16x2 
dpp8_f16_4x32x2 
dpp8_f16_8x16x2 
dpp8_f16_8x32x2 
dpp8_f16_16x16x2 
dpp8_f16_32x8x2 

◆ f8_rounding_mode

enum class ck::f8_rounding_mode
strong
Enumerator
standard 
stochastic 

◆ IndicesType

enum struct ck::IndicesType
strong
Enumerator
INDICES_32BIT 
INDICES_64BIT 
INDICES_16BIT 
INDICES_8BIT 

◆ InMemoryDataOperationEnum

enum struct ck::InMemoryDataOperationEnum
strong
Enumerator
Set 
AtomicAdd 
AtomicMax 
Add 

◆ LoopScheduler

enum struct ck::LoopScheduler
strong
Enumerator
Default 
Interwave 

◆ MfmaInstr

enum struct ck::MfmaInstr
strong
Enumerator
mfma_f32_32x32x1f32 
mfma_f32_16x16x1f32 
mfma_f32_4x4x1f32 
mfma_f32_32x32x2f32 
mfma_f32_16x16x4f32 
mfma_f32_32x32x4f16 
mfma_f32_16x16x4f16 
mfma_f32_4x4x4f16 
mfma_f32_32x32x8f16 
mfma_f32_16x16x16f16 
mfma_f32_32x32x8bf16_1k 
mfma_f32_16x16x16bf16_1k 
mfma_f32_32x32x4bf16 
mfma_f32_16x16x8bf16 
mfma_i32_32x32x8i8 
mfma_i32_16x16x16i8 
mfma_i32_32x32x16i8 
mfma_i32_16x16x32i8 
mfma_f64_16x16x4f64 
mfma_f32_32x32x16f8f8 
mfma_f32_16x16x32f8f8 
mfma_f32_32x32x16bf8bf8 
mfma_f32_16x16x32bf8bf8 
mfma_f32_32x32x16f8bf8 
mfma_f32_16x16x32f8bf8 
mfma_f32_32x32x16bf8f8 
mfma_f32_16x16x32bf8f8 
mfma_f32_32x32x16f16 
mfma_f32_16x16x32f16 
mfma_f32_32x32x16bf16 
mfma_f32_16x16x32bf16 
mfma_i32_32x32x32i8 
mfma_i32_16x16x64i8 
mfma_f32_32x32x64f8f6f4 
mfma_f32_16x16x128f8f6f4 
mfma_scale_f32_32x32x64f8f6f4 
mfma_scale_f32_16x16x128f8f6f4 
mfma_f32_16x16x8xf32 
mfma_f32_32x32x4xf32 
wmma_f32_16x16x16_f16 
wmma_f32_16x16x16_bf16 
wmma_i32_16x16x16_iu8 
wmma_unsupport_16x16_gfx11 
wmma_f32_16x16x16_f16_gfx12 
wmma_f32_16x16x16_bf16_gfx12 
wmma_i32_16x16x16_iu8_gfx12 
wmma_f32_16x16x16_f8f8_gfx12 
wmma_f32_16x16x16_f8bf8_gfx12 
wmma_f32_16x16x16_bf8f8_gfx12 
wmma_f32_16x16x16_bf8bf8_gfx12 
wmma_unsupport_16x16_gfx12 

◆ NanPropagation

enum struct ck::NanPropagation
strong
Enumerator
NOT_PROPAGATE_NAN 
PROPAGATE_NAN 

◆ PipelineVersion

enum struct ck::PipelineVersion
strong
Enumerator
v1 
v2 
v4 
weight_only 

◆ ReduceTensorIndices

enum struct ck::ReduceTensorIndices
strong
Enumerator
NO_INDICES 
FLATTENED_INDICES 

◆ ReduceTensorOp

enum struct ck::ReduceTensorOp
strong
Enumerator
ADD 
MUL 
MIN 
MAX 
AMAX 
AVG 
NORM1 
NORM2 

◆ SchedulerGroup

Enumerator
SCHED_GROUP_MFMA 
SCHED_GROUP_VMEM 
SCHED_GROUP_LDS_READ 
SCHED_GROUP_LDS_WRITE 

◆ SmfmacInstr

enum struct ck::SmfmacInstr
strong
Enumerator
smfmac_f32_16x16x32f16 
smfmac_f32_32x32x16f16 
smfmac_f32_16x16x32bf16 
smfmac_f32_32x32x16bf16 

◆ StreamKReductionStrategy

Enumerator
Atomic 
Reduction 

◆ TailNumber

enum struct ck::TailNumber
strong
Enumerator
Odd 
Even 
One 
Two 
Three 
Four 
Five 
Six 
Seven 
Empty 
Full 

◆ WmmaInstr

enum struct ck::WmmaInstr
strong
Enumerator
wmma_f32_16x16x16_f16 
wmma_f32_16x16x16_bf16 
wmma_f16_16x16x16_f16 
wmma_bf16_16x16x16_bf16 
wmma_i32_16x16x16_iu8 
wmma_i32_16x16x16_iu4 
wmma_f32_16x16x16_f16_gfx12 
wmma_f32_16x16x16_bf16_gfx12 
wmma_i32_16x16x16_iu8_gfx12 
wmma_f32_16x16x16_f8f8_gfx12 
wmma_f32_16x16x16_f8bf8_gfx12 
wmma_f32_16x16x16_bf8f8_gfx12 
wmma_f32_16x16x16_bf8bf8_gfx12 

Function Documentation

◆ accumulate_n()

template<typename T, typename ForwardIterator, typename Size, typename BinaryOperation>
auto ck::accumulate_n ( ForwardIterator first,
Size count,
T init,
BinaryOperation op )->decltype(std::accumulate(first, std::next(first, count), init, op))

◆ amd_assemble_cvt_f32_i4()

__device__ float ck::amd_assemble_cvt_f32_i4 ( int b)
inline

◆ amd_assembly_and_b32()

__device__ int ck::amd_assembly_and_b32 ( int a,
int b )
inline

◆ amd_assembly_and_or_b32()

__device__ int ck::amd_assembly_and_or_b32 ( int a,
int b,
int d )
inline

◆ amd_assembly_cvt_f8_to_f32()

__device__ f8x4_t ck::amd_assembly_cvt_f8_to_f32 ( float b0,
float b1,
float b2,
float b3 )
inline

◆ amd_assembly_i4_to_fp8x8()

__device__ f8x8_t ck::amd_assembly_i4_to_fp8x8 ( int a)
inline

◆ amd_assembly_outer_product_1x2() [1/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( float a,
float b0,
float b1,
float & c0,
float & c1 )

◆ amd_assembly_outer_product_1x2() [2/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( half2_t a,
half2_t b0,
half2_t b1,
float & c0,
float & c1 )

◆ amd_assembly_outer_product_1x2() [3/3]

__device__ void ck::amd_assembly_outer_product_1x2 ( int8x4_t a,
int8x4_t b0,
int8x4_t b1,
int32_t & c0,
int32_t & c1 )

◆ amd_assembly_outer_product_1x4() [1/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( float a,
float b0,
float b1,
float b2,
float b3,
float & c0,
float & c1,
float & c2,
float & c3 )

◆ amd_assembly_outer_product_1x4() [2/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( half2_t a,
half2_t b0,
half2_t b1,
half2_t b2,
half2_t b3,
float & c0,
float & c1,
float & c2,
float & c3 )

◆ amd_assembly_outer_product_1x4() [3/3]

__device__ void ck::amd_assembly_outer_product_1x4 ( int8x4_t a,
int8x4_t b0,
int8x4_t b1,
int8x4_t b2,
int8x4_t b3,
int32_t & c0,
int32_t & c1,
int32_t & c2,
int32_t & c3 )

◆ amd_assembly_pk_add_f16()

__device__ half2_t ck::amd_assembly_pk_add_f16 ( half2_t a,
half2_t b )
inline

◆ amd_assembly_pk_fma_f16()

__device__ half2_t ck::amd_assembly_pk_fma_f16 ( half2_t a,
half2_t b,
half2_t c )
inline

◆ amd_buffer_atomic_add()

template<typename T, index_t N>
__device__ void ck::amd_buffer_atomic_add ( const typename vector_type_maker< T, N >::type::type src_thread_data,
T * p_dst_wave,
const index_t dst_thread_element_offset,
const bool dst_thread_element_valid,
const index_t dst_element_space_size )

◆ amd_buffer_atomic_add_impl()

template<typename T, index_t N>
__device__ void ck::amd_buffer_atomic_add_impl ( const typename vector_type< T, N >::type src_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_atomic_max()

template<typename T, index_t N>
__device__ void ck::amd_buffer_atomic_max ( const typename vector_type_maker< T, N >::type::type src_thread_data,
T * p_dst_wave,
const index_t dst_thread_element_offset,
const bool dst_thread_element_valid,
const index_t dst_element_space_size )

◆ amd_buffer_atomic_max_impl()

template<typename T, index_t N>
__device__ void ck::amd_buffer_atomic_max_impl ( const typename vector_type< T, N >::type src_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_load_impl() [1/2]

template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type ck::amd_buffer_load_impl ( __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset )

◆ amd_buffer_load_impl() [2/2]

template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< T, N >::type ck::amd_buffer_load_impl ( int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset )

◆ amd_buffer_load_impl_raw() [1/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type ck::amd_buffer_load_impl_raw ( __amdgpu_buffer_rsrc_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset )

◆ amd_buffer_load_impl_raw() [2/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type< int8_t, N >::type ck::amd_buffer_load_impl_raw ( int32x4_t src_wave_buffer_resource,
index_t src_thread_addr_offset,
index_t src_wave_addr_offset )

◆ amd_buffer_load_invalid_element_return_customized_value()

template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_customized_value ( const T * p_src_wave,
index_t src_thread_element_offset,
bool src_thread_element_valid,
index_t src_element_space_size,
T customized_value )

◆ amd_buffer_load_invalid_element_return_zero()

template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ vector_type_maker< T, N >::type::type ck::amd_buffer_load_invalid_element_return_zero ( const T * p_src_wave,
index_t src_thread_element_offset,
bool src_thread_element_valid,
index_t src_element_space_size )

◆ amd_buffer_store()

template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store ( const typename vector_type_maker< T, N >::type::type src_thread_data,
T * p_dst_wave,
const index_t dst_thread_element_offset,
const bool dst_thread_element_valid,
const index_t dst_element_space_size )

◆ amd_buffer_store_impl() [1/2]

template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl ( const typename vector_type< T, N >::type src_thread_data,
__amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_store_impl() [2/2]

template<typename T, index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl ( const typename vector_type< T, N >::type src_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_store_impl_raw() [1/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl_raw ( const typename vector_type< int8_t, N >::type src_thread_data,
__amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_buffer_store_impl_raw() [2/2]

template<index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
__device__ void ck::amd_buffer_store_impl_raw ( const typename vector_type< int8_t, N >::type src_thread_data,
int32x4_t dst_wave_buffer_resource,
index_t dst_thread_addr_offset,
index_t dst_wave_addr_offset )

◆ amd_direct_load_global_to_lds()

template<typename T, index_t NumElemsPerThread>
__device__ void ck::amd_direct_load_global_to_lds ( const T * global_base_ptr,
const index_t global_offset,
T * lds_base_ptr,
const index_t lds_offset,
const bool is_valid,
const index_t src_element_space_size )

◆ amd_global_atomic_add_impl()

template<typename T, index_t N>
__device__ void ck::amd_global_atomic_add_impl ( const typename vector_type< T, N >::type src_thread_data,
T * addr )

◆ amd_wave_read_first_lane() [1/4]

template<typename Object, typename = ck::enable_if_t<ck::is_class_v<Object> && ck::is_trivially_copyable_v<Object>>>
__device__ auto ck::amd_wave_read_first_lane ( const Object & obj)

NOTE: This implicitly starts the lifetime of the object. It would be better to use std::start_lifetime_as() in this scenario.

◆ amd_wave_read_first_lane() [2/4]

__device__ int32_t ck::amd_wave_read_first_lane ( int32_t value)
inline

◆ amd_wave_read_first_lane() [3/4]

__device__ int64_t ck::amd_wave_read_first_lane ( int64_t value)
inline

◆ amd_wave_read_first_lane() [4/4]

__device__ uint32_t ck::amd_wave_read_first_lane ( uint32_t value)
inline

◆ array_convert() [1/2]

template<typename Y, typename X, index_t NumElems>
__host__ __device__ void ck::array_convert ( Array< Y, NumElems > & y,
const Array< X, NumElems > & x )
inline

◆ array_convert() [2/2]

template<typename Y, typename X, size_t NumElems>
__host__ __device__ void ck::array_convert ( std::array< Y, NumElems > & y,
const std::array< X, NumElems > & x )
inline

◆ atomic_add()

template<typename X>
__device__ X ck::atomic_add ( X * p_dst,
const X & x )

◆ atomic_add< _Float16 >()

template<>
__device__ _Float16 ck::atomic_add< _Float16 > ( _Float16 * p_dst,
const _Float16 & x )

◆ atomic_add< double >()

template<>
__device__ double ck::atomic_add< double > ( double * p_dst,
const double & x )

◆ atomic_add< double2_t >()

template<>
__device__ double2_t ck::atomic_add< double2_t > ( double2_t * p_dst,
const double2_t & x )

◆ atomic_add< float >()

template<>
__device__ float ck::atomic_add< float > ( float * p_dst,
const float & x )

◆ atomic_add< float2_t >()

template<>
__device__ float2_t ck::atomic_add< float2_t > ( float2_t * p_dst,
const float2_t & x )

◆ atomic_add< int32_t >()

template<>
__device__ int32_t ck::atomic_add< int32_t > ( int32_t * p_dst,
const int32_t & x )

◆ atomic_add< uint32_t >()

template<>
__device__ uint32_t ck::atomic_add< uint32_t > ( uint32_t * p_dst,
const uint32_t & x )

◆ atomic_add< unsigned short >()

template<>
__device__ unsigned short ck::atomic_add< unsigned short > ( unsigned short * p_dst,
const unsigned short & x )

◆ atomic_max()

template<typename X>
__device__ X ck::atomic_max ( X * p_dst,
const X & x )

◆ atomic_max< double >()

template<>
__device__ double ck::atomic_max< double > ( double * p_dst,
const double & x )

◆ atomic_max< float >()

template<>
__device__ float ck::atomic_max< float > ( float * p_dst,
const float & x )

◆ atomic_max< float2_t >()

template<>
__device__ float2_t ck::atomic_max< float2_t > ( float2_t * p_dst,
const float2_t & x )

◆ atomic_max< int32_t >()

template<>
__device__ int32_t ck::atomic_max< int32_t > ( int32_t * p_dst,
const int32_t & x )

◆ atomic_max< uint32_t >()

template<>
__device__ uint32_t ck::atomic_max< uint32_t > ( uint32_t * p_dst,
const uint32_t & x )

◆ bf16_convert_rtn()

template<typename Y, typename X>
__host__ __device__ constexpr Y ck::bf16_convert_rtn ( X x)
constexpr

◆ bf16_convert_rtn< bhalf_t, float >()

template<>
__host__ __device__ constexpr bhalf_t ck::bf16_convert_rtn< bhalf_t, float > ( float x)
inline constexpr

◆ bf16_convert_rtn< bhalf_t, half_t >()

template<>
__host__ __device__ constexpr bhalf_t ck::bf16_convert_rtn< bhalf_t, half_t > ( half_t x)
inline constexpr

◆ bf6_convert_rne() [1/2]

__host__ __device__ bf6_t ck::bf6_convert_rne ( float x,
float scale = 1.0f )
inline

Converts a float to the 6-bit BF6 type using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to a 6-bit BF6 floating-point format.

Parameters
x: The float value to be converted.
scale: The scaling factor applied to the input before conversion.
Returns
The converted bf6_t value.

◆ bf6_convert_rne() [2/2]

__host__ __device__ bf6x32_t ck::bf6_convert_rne ( float32_t x,
float scale = 1.0f )
inline

Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to a 6-bit BF6 floating-point format.

Parameters
x: The float vector to be converted.
scale: The scaling factor applied to the input before conversion.
Returns
The converted bf6x32_t vector.

◆ bf6_convert_sr() [1/2]

__host__ __device__ bf6_t ck::bf6_convert_sr ( float x,
float scale = 1.0f )
inline

Converts a float to the 6-bit BF6 type using stochastic rounding.

Divides the input by the specified scale, and converts the result to a 6-bit BF6 floating-point format with stochastic rounding.

Parameters
x: The float value to be converted.
scale: The scaling factor applied to the input before conversion.
Returns
The converted bf6_t value.

◆ bf6_convert_sr() [2/2]

__host__ __device__ bf6x32_t ck::bf6_convert_sr ( float32_t x,
float scale = 1.0f )
inline

Converts a vector of 32 floats to the vector of 32 6-bit BF6 types using stochastic rounding.

Divides the input by the specified scale, and converts the result to a 6-bit BF6 floating-point format with stochastic rounding.

Parameters
x: The float vector to be converted.
scale: The scaling factor applied to the input before conversion.
Returns
The converted bf6x32_t vector.

◆ bit_cast()

template<typename Y, typename X, typename enable_if< sizeof(X)==sizeof(Y), bool >::type = false>
__host__ __device__ constexpr Y ck::bit_cast ( const X & x)
constexpr

◆ block_sync_lds()

__device__ void ck::block_sync_lds ( )

◆ block_sync_lds_direct_load()

__device__ void ck::block_sync_lds_direct_load ( )

◆ BlockGemmABScalePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
auto ck::BlockGemmABScalePipeline_Selector ( )
constexpr

◆ BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
auto ck::BlockGemmBlockMoeScaleBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmBlockScaleBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MScaleBlock, index_t NScaleBlock, index_t KScaleBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
auto ck::BlockGemmBlockScaleBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmBPreshufflePipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
auto ck::BlockGemmBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXBPreshufflePipeline_Selector() [1/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
auto ck::BlockGemmMXBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXBPreshufflePipeline_Selector() [2/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
auto ck::BlockGemmMXBPreshufflePipeline_Selector ( )
constexpr

◆ BlockGemmMXNBSPipeline_Selector()

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
auto ck::BlockGemmMXNBSPipeline_Selector ( )
constexpr

◆ BlockGemmMXPipeline_Selector() [1/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool GUFusion = false>
auto ck::BlockGemmMXPipeline_Selector ( )
constexpr

◆ BlockGemmMXPipeline_Selector() [2/2]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t ThreadBlockSize, index_t ScaleBlockSize, typename ADataType, typename AScaleDataType, typename BDataType, typename BScaleDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
auto ck::BlockGemmMXPipeline_Selector ( )
constexpr

◆ BlockGemmPipeline_Selector() [1/3]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeTypeA, typename ComputeTypeB, typename AccDataType, typename AWmmaTileDesc, typename BWmmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerWmma, index_t NPerWmma, index_t MRepeat, index_t NRepeat, index_t KPack, bool TransposeC = false>
auto ck::BlockGemmPipeline_Selector ( )
constexpr

◆ BlockGemmPipeline_Selector() [2/3]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack>
auto ck::BlockGemmPipeline_Selector ( )
constexpr

◆ BlockGemmPipeline_Selector() [3/3]

template<BlockGemmPipelineVersion BlkGemmPipelineVer, BlockGemmPipelineScheduler BlkGemmPipeSche, index_t BlockSize, typename ADataType, typename BDataType, typename ComputeDataType, typename AccDataType, typename ATileDesc, typename BTileDesc, typename AMmaTileDesc, typename BMmaTileDesc, index_t ABlockTransferSrcScalarPerVector, index_t BBlockTransferSrcScalarPerVector, index_t MPerBlock, index_t NPerBlock, index_t KPerBlock, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, bool DirectLoad = false>
auto ck::BlockGemmPipeline_Selector ( )
constexpr

◆ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector()

template<index_t BlockSize, typename FloatA, typename FloatB, typename FloatAcc, typename AK0MK1BlockDesc, typename BK0NK1BlockDesc, index_t MPerXDL, index_t NPerXDL, index_t MRepeat, index_t NRepeat, index_t KPack, LoopScheduler LoopSched, typename ComputeTypeA = FloatA, typename ComputeTypeB = FloatB>
auto ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector ( )
constexpr

◆ c_style_pointer_cast()

template<typename PY, typename PX, typename enable_if< is_pointer_v< PY > &&is_pointer_v< PX >, bool >::type = false>
__host__ __device__ PY ck::c_style_pointer_cast ( PX p_x)

◆ cast_pointer_to_constant_address_space()

template<typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE * ck::cast_pointer_to_constant_address_space ( T * p)

◆ cast_pointer_to_generic_address_space()

template<typename T>
__device__ T * ck::cast_pointer_to_generic_address_space ( T CK_CONSTANT_ADDRESS_SPACE * p)

◆ chain_tensor_adaptors() [1/2]

template<typename TensorAdaptor0, typename TensorAdaptor1>
__host__ __device__ constexpr auto ck::chain_tensor_adaptors ( const TensorAdaptor0 & adaptor0,
const TensorAdaptor1 & adaptor1 )
constexpr

◆ chain_tensor_adaptors() [2/2]

template<typename X, typename... Xs, typename enable_if< sizeof...(Xs) >= 2, bool >::type = false>
__host__ __device__ constexpr auto ck::chain_tensor_adaptors ( const X & x,
const Xs &... xs )
constexpr

◆ clz()

__device__ int ck::clz ( uint32_t x)
inline

◆ concat_tuple() [1/3]

template<typename... X>
__host__ __device__ constexpr auto ck::concat_tuple ( const Tuple< X... > & tx)
constexpr

◆ concat_tuple() [2/3]

template<typename... X, typename... Y>
__host__ __device__ constexpr auto ck::concat_tuple ( const Tuple< X... > & tx,
const Tuple< Y... > & ty )
constexpr

◆ concat_tuple() [3/3]

template<typename... X, typename... Tuples>
__host__ __device__ constexpr auto ck::concat_tuple ( const Tuple< X... > & tx,
const Tuples &... tuples )
constexpr

◆ concat_tuple_of_reference()

template<typename... X, typename... Y>
__host__ __device__ constexpr auto ck::concat_tuple_of_reference ( const Tuple< X &... > & tx,
const Tuple< Y &... > & ty )
constexpr

◆ conditional_expr()

template<bool predicate, typename X, typename Y>
auto ck::conditional_expr ( X && x,
Y && y )
constexpr

◆ container_concat() [1/4]

template<typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto ck::container_concat ( const Array< T, NX > & ax,
const Array< T, NY > & ay )
constexpr

◆ container_concat() [2/4]

template<typename Container>
__host__ __device__ constexpr auto ck::container_concat ( const Container & x)
constexpr

◆ container_concat() [3/4]

template<typename... X, typename... Y>
__host__ __device__ constexpr auto ck::container_concat ( const Tuple< X... > & tx,
const Tuple< Y... > & ty )
constexpr

◆ container_concat() [4/4]

template<typename X, typename... Ys>
__host__ __device__ constexpr auto ck::container_concat ( const X & x,
const Ys &... ys )
constexpr

◆ container_push_back() [1/2]

template<typename TData, index_t NSize>
__host__ __device__ constexpr auto ck::container_push_back ( const Array< TData, NSize > & a,
const TData & x )
constexpr

◆ container_push_back() [2/2]

template<typename... Ts, typename T>
__host__ __device__ constexpr auto ck::container_push_back ( const Tuple< Ts... > & a,
const T & x )
constexpr

◆ container_push_front()

template<typename... Ts, typename T>
__host__ __device__ constexpr auto ck::container_push_front ( const Tuple< Ts... > & a,
const T & x )
constexpr

◆ container_reduce()

template<typename Container, typename Reduce, typename Init, index_t IBegin = 0, index_t IEnd = Container::Size(), index_t IStep = 1>
__host__ __device__ constexpr auto ck::container_reduce ( const Container & x,
Reduce reduce,
Init init,
Number< IBegin > = Number<0>{},
Number< IEnd > = Number<Container::Size()>{},
Number< IStep > = Number<1>{} )
constexpr

◆ container_reorder_given_new2old() [1/3]

template<typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto ck::container_reorder_given_new2old ( const Array< TData, NSize > & old_array,
Sequence< IRs... >  )
constexpr

◆ container_reorder_given_new2old() [2/3]

template<typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto ck::container_reorder_given_new2old ( const Tuple< Ts... > & old_tuple,
Sequence< IRs... >  )
constexpr

◆ container_reorder_given_new2old() [3/3]

template<index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto ck::container_reorder_given_new2old ( Sequence< Is... > ,
Sequence< IRs... >  )
constexpr

◆ container_reorder_given_old2new() [1/3]

template<typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto ck::container_reorder_given_old2new ( const Array< TData, NSize > & old_array,
Sequence< IRs... > old2new )
constexpr

◆ container_reorder_given_old2new() [2/3]

template<typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto ck::container_reorder_given_old2new ( const Tuple< Ts... > & old_tuple,
Sequence< IRs... > old2new )
constexpr

◆ container_reorder_given_old2new() [3/3]

template<index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto ck::container_reorder_given_old2new ( Sequence< Is... > old_seq,
Sequence< IRs... >  )
constexpr

◆ container_reverse_exclusive_scan() [1/3]

template<typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto ck::container_reverse_exclusive_scan ( const Array< TData, NSize > & x,
Reduce f,
TData init )
constexpr

◆ container_reverse_exclusive_scan() [2/3]

template<index_t... Is, typename Reduce, index_t Init>
__host__ __device__ constexpr auto ck::container_reverse_exclusive_scan ( const Sequence< Is... > & seq,
Reduce f,
Number< Init >  )
constexpr

◆ container_reverse_exclusive_scan() [3/3]

template<typename... Xs, typename Reduce, typename Init>
__host__ __device__ constexpr auto ck::container_reverse_exclusive_scan ( const Tuple< Xs... > & x,
Reduce reduce,
Init init )
constexpr

◆ container_reverse_inclusive_scan() [1/2]

template<typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto ck::container_reverse_inclusive_scan ( const Array< TData, NSize > & x,
Reduce f,
TData init )
constexpr

◆ container_reverse_inclusive_scan() [2/2]

template<typename... Xs, typename Reduce, typename TData>
__host__ __device__ constexpr auto ck::container_reverse_inclusive_scan ( const Tuple< Xs... > & x,
Reduce f,
TData init )
constexpr

◆ coordinate_has_valid_offset()

template<typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool ck::coordinate_has_valid_offset ( const TensorDesc & tensor_desc,
const TensorCoord & coord )
constexpr

◆ coordinate_has_valid_offset_assuming_visible_index_is_valid()

template<typename TensorDesc, typename TensorCoord>
__host__ __device__ constexpr bool ck::coordinate_has_valid_offset_assuming_visible_index_is_valid ( const TensorDesc & tensor_desc,
const TensorCoord & coord )
constexpr

◆ DefaultValidCTileIndex()

template<typename CTileIdx, typename CTileDim>
__host__ __device__ bool ck::DefaultValidCTileIndex ( const CTileIdx & c_tile_idx,
const CTileDim & c_tile_dim )

◆ EnvGetString()

template<class EnvVar>
const std::string & ck::EnvGetString ( EnvVar )
inline

◆ EnvIsDisabled()

template<class EnvVar>
bool ck::EnvIsDisabled ( EnvVar )
inline

◆ EnvIsEnabled()

template<class EnvVar>
bool ck::EnvIsEnabled ( EnvVar )
inline

◆ EnvIsUnset()

template<class EnvVar>
bool ck::EnvIsUnset ( EnvVar )
inline

◆ EnvUnset()

template<class EnvVar>
void ck::EnvUnset ( EnvVar )

◆ EnvValue()

template<class EnvVar>
uint64_t ck::EnvValue ( EnvVar )
inline

◆ f4_convert_rne() [1/2]

__host__ __device__ f4_t ck::f4_convert_rne ( float x,
float scale = 1.0f )
inline

◆ f4_convert_rne() [2/2]

__host__ __device__ f4x32_t ck::f4_convert_rne ( float2_t x,
float scale = 1.0f )
inline

◆ f4_convert_sr() [1/2]

__host__ __device__ f4_t ck::f4_convert_sr ( float x,
float scale = 1.0f )
inline

◆ f4_convert_sr() [2/2]

__host__ __device__ f4x32_t ck::f4_convert_sr ( float2_t x,
float scale = 1.0f )
inline

◆ f6_convert_rne() [1/2]

__host__ __device__ f6_t ck::f6_convert_rne ( float x,
float scale = 1.0f )
inline

Converts a float to a 6-bit float type (f6_t) using round-to-nearest-even.

Divides the input by the specified scale, then saturates and converts it to the 6-bit floating-point format (f6_t).

Parameters
x: The input float value.
scale: A scaling factor applied to x before conversion.
Returns
The converted f6_t value.

◆ f6_convert_rne() [2/2]

__host__ __device__ f6x32_t ck::f6_convert_rne ( float32_t x,
float scale = 1.0f )
inline

Converts a 32-element single-precision float array into a packed 6-bit representation.

This function divides each input float by the provided scale value, then performs conversion with rounding to nearest / even to pack each element into 6 bits of precision.

Parameters
x: A vector of 32 floats stored in float32_t.
scale: A scaling factor for each float before conversion.
Returns
An f6x32_t object storing the compressed 6-bit representation.

◆ f6_convert_sr() [1/2]

__host__ __device__ f6_t ck::f6_convert_sr ( float x,
float scale = 1.0f )
inline

Converts a float to the 6-bit floating-point type (f6_t) using stochastic rounding.

Divides the input by the specified scale, then performs saturation and conversion to f6_t based on a pseudo-randomly generated seed.

Parameters
x: The input float value.
scale: A scaling factor applied to x before conversion.
Returns
The converted f6_t value.

◆ f6_convert_sr() [2/2]

__host__ __device__ f6x32_t ck::f6_convert_sr ( float32_t x,
float scale = 1.0f )
inline

Converts a 32-element single-precision float array into a packed 6-bit representation.

This function divides each input float by the provided scale value, then performs conversion with stochastic rounding to pack each element into 6 bits of precision.

Parameters
x: A vector of 32 floats stored in float32_t.
scale: A scaling factor for each float before conversion.
Returns
An f6x32_t object storing the compressed 6-bit representation.

◆ f8_convert_rne()

template<typename Y, typename X>
__host__ __device__ constexpr Y ck::f8_convert_rne ( X x)
constexpr

◆ f8_convert_rne< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_rne< bf8_fnuz_t, float > ( float x)
inline

◆ f8_convert_rne< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_rne< bf8_fnuz_t, half_t > ( half_t x)
inline

◆ f8_convert_rne< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, bhalf_t > ( bhalf_t x)
inline

Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even.

Parameters
x: The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, float > ( float x)
inline

Converts a float to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even.

Parameters
x: The input float value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_rne< bf8_ocp_t, half_t > ( half_t x)
inline

Converts a half_t to an 8-bit float type (bf8_ocp_t) using rounding to nearest/even.

Parameters
x: The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_rne< bf8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, bhalf2_t > ( bhalf2_t x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.

Parameters
x: The input vector of 2 bhalf_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, float2_t > ( float2_t x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.

Parameters
x: The input vector of 2 floats.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< bf8x2_ocp_t, half2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_rne< bf8x2_ocp_t, half2_t > ( half2_t x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using rounding to nearest/even.

Parameters
x: The input vector of 2 half_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_rne< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_rne< f8_fnuz_t, float > ( float x)
inline

◆ f8_convert_rne< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_rne< f8_fnuz_t, half_t > ( half_t x)
inline

◆ f8_convert_rne< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, bhalf_t > ( bhalf_t x)
inline

Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using rounding to nearest/even.

Parameters
x: The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, float > ( float x)
inline

Converts a float to an 8-bit float type (f8_ocp_t) using rounding to nearest/even.

Parameters
x: The input float value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_rne< f8_ocp_t, half_t > ( half_t x)
inline

Converts a half_t to an 8-bit float type (f8_ocp_t) using rounding to nearest/even.

Parameters
x: The input half_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_rne< f8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, bhalf2_t > ( bhalf2_t x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.

Parameters
x: The input vector of 2 bhalf_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_rne< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, float2_t > ( float2_t x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.

Parameters
x: The input vector of 2 floats.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_rne< f8x2_ocp_t, half2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_rne< f8x2_ocp_t, half2_t > ( half2_t x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using rounding to nearest/even.

Parameters
x: The input vector of 2 half_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_sr()

template<typename Y, typename X>
__host__ __device__ constexpr Y ck::f8_convert_sr ( X x)
constexpr

◆ f8_convert_sr< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_sr< bf8_fnuz_t, float > ( float x)
inline

◆ f8_convert_sr< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::f8_convert_sr< bf8_fnuz_t, half_t > ( half_t x)
inline

◆ f8_convert_sr< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, bhalf_t > ( bhalf_t x)
inline

Converts a bhalf_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, float > ( float x)
inline

Converts a float to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input float value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::f8_convert_sr< bf8_ocp_t, half_t > ( half_t x)
inline

Converts a half_t to an 8-bit float type (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ f8_convert_sr< bf8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, bhalf2_t > ( bhalf2_t x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 bhalf_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, float2_t > ( float2_t x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 floats.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< bf8x2_ocp_t, half2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::f8_convert_sr< bf8x2_ocp_t, half2_t > ( half2_t x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float types (bf8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 half_t.
Returns
The converted vector of 2 bf8_ocp_t.

◆ f8_convert_sr< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_sr< f8_fnuz_t, float > ( float x)
inline

◆ f8_convert_sr< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::f8_convert_sr< f8_fnuz_t, half_t > ( half_t x)
inline

◆ f8_convert_sr< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, bhalf_t > ( bhalf_t x)
inline

Converts a bhalf_t to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x: The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, float > ( float x)
inline

Converts a float to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x: The input float value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::f8_convert_sr< f8_ocp_t, half_t > ( half_t x)
inline

Converts a half_t to an 8-bit float type (f8_ocp_t) using stochastic rounding.

Parameters
x: The input half_t value.
Returns
The converted f8_ocp_t value.

◆ f8_convert_sr< f8x2_ocp_t, bhalf2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, bhalf2_t > ( bhalf2_t x)
inline

Converts a vector of 2 bhalf_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 bhalf_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_sr< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, float2_t > ( float2_t x)
inline

Converts a vector of 2 floats to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 floats.
Returns
The converted vector of 2 f8_ocp_t.

◆ f8_convert_sr< f8x2_ocp_t, half2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::f8_convert_sr< f8x2_ocp_t, half2_t > ( half2_t x)
inline

Converts a vector of 2 half_t to a vector of 2 8-bit float types (f8_ocp_t) using stochastic rounding.

Parameters
x: The input vector of 2 half_t.
Returns
The converted vector of 2 f8_ocp_t.

◆ fnv1a_hash()

unsigned int ck::fnv1a_hash ( std::string_view str,
unsigned int h = 2166136261u )
constexpr

◆ fp8_is_inf()

template<>
__host__ __device__ constexpr bool ck::fp8_is_inf ( bf8_ocp_t a)
inline constexpr

◆ fp8_is_nan() [1/4]

template<>
__host__ __device__ constexpr bool ck::fp8_is_nan ( bf8_fnuz_t a)
inline constexpr

◆ fp8_is_nan() [2/4]

template<>
__host__ __device__ constexpr bool ck::fp8_is_nan ( bf8_ocp_t a)
inline constexpr

◆ fp8_is_nan() [3/4]

template<>
__host__ __device__ constexpr bool ck::fp8_is_nan ( f8_fnuz_t a)
inline constexpr

◆ fp8_is_nan() [4/4]

template<>
__host__ __device__ constexpr bool ck::fp8_is_nan ( f8_ocp_t a)
inline constexpr

◆ generate_sequence()

template<typename F, index_t N>
__host__ __device__ constexpr auto ck::generate_sequence ( F ,
Number< N >  )
constexpr

◆ generate_sequence_v2()

template<typename F, index_t N>
__host__ __device__ constexpr auto ck::generate_sequence_v2 ( F && f,
Number< N >  )
constexpr

◆ generate_tie()

template<typename F, index_t N>
__host__ __device__ constexpr auto ck::generate_tie ( F && f,
Number< N >  )
constexpr

◆ generate_tuple() [1/2]

template<typename F, index_t N>
__host__ __device__ constexpr auto ck::generate_tuple ( F && f,
LongNumber< N >  )
constexpr

◆ generate_tuple() [2/2]

template<typename F, index_t N>
__host__ __device__ constexpr auto ck::generate_tuple ( F && f,
Number< N >  )
constexpr

◆ generate_tuple_for()

template<typename F, index_t... ids>
__host__ __device__ constexpr auto ck::generate_tuple_for ( F && f,
Sequence< ids... >  )
constexpr

◆ get_available_cpu_cores()

unsigned int ck::get_available_cpu_cores ( )
inline

◆ get_block_1d_id()

__device__ index_t ck::get_block_1d_id ( )

◆ get_block_size()

__device__ index_t ck::get_block_size ( )

◆ get_container_subset() [1/2]

template<typename T, index_t N, index_t... Is>
__host__ __device__ constexpr auto ck::get_container_subset ( const Array< T, N > & arr,
Sequence< Is... >  )
constexpr

◆ get_container_subset() [2/2]

template<typename... Ts, index_t... Is>
__host__ __device__ constexpr auto ck::get_container_subset ( const Tuple< Ts... > & tup,
Sequence< Is... >  )
constexpr

◆ get_device_name()

std::string ck::get_device_name ( )
inline

◆ get_grid_size()

__device__ index_t ck::get_grid_size ( )

◆ get_shift< 1 >()

template<>
__device__ index_t ck::get_shift< 1 > ( )
constexpr

◆ get_thread_global_1d_id()

__device__ index_t ck::get_thread_global_1d_id ( )

◆ get_thread_local_1d_id()

__device__ index_t ck::get_thread_local_1d_id ( )

◆ get_type_name()

template<typename T>
const char * ck::get_type_name ( )
inline

◆ get_warp_local_1d_id()

__device__ index_t ck::get_warp_local_1d_id ( )

◆ get_warp_size()

__host__ index_t ck::get_warp_size ( )
inline constexpr

◆ GridwiseGemmPipeline_Selector()

template<PipelineVersion PipelineVer, index_t NumPrefetch = 1, LoopScheduler LoopSched = LoopScheduler::Default, bool AEnableLds = true, bool BEnableLds = true>
auto ck::GridwiseGemmPipeline_Selector ( )
constexpr

◆ GridwiseGemmPipeline_v1_Selector()

template<index_t NumPrefetch, LoopScheduler LoopSched>
auto ck::GridwiseGemmPipeline_v1_Selector ( )
constexpr

◆ i4_to_bhalf4()

__device__ bhalf4_t ck::i4_to_bhalf4 ( int q)
inline

◆ i4_to_f8x4()

__device__ f8x4_t ck::i4_to_f8x4 ( int q)
inline

◆ i4_to_fp8x8()

__device__ f8x8_t ck::i4_to_fp8x8 ( int q)
inline

◆ i4_to_half4()

__device__ half4_t ck::i4_to_half4 ( int q)
inline

◆ i4_to_half4_scale()

__device__ half4_t ck::i4_to_half4_scale ( int q,
const ck::half2_t & scale )
inline

◆ inclusive_scan_sequence()

template<typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto ck::inclusive_scan_sequence ( Seq ,
Reduce ,
Number< Init >  )
constexpr

◆ inner_product()

template<typename TA, typename TB, typename TC>
__device__ void ck::inner_product ( const TA & a,
const TB & b,
TC & c )

◆ inner_product< bhalf_t, bhalf_t, float >()

template<>
__device__ void ck::inner_product< bhalf_t, bhalf_t, float > ( const bhalf_t & a,
const bhalf_t & b,
float & c )

◆ inner_product< float, float, float >()

template<>
__device__ void ck::inner_product< float, float, float > ( const float & a,
const float & b,
float & c )

◆ inner_product< float2_t, float2_t, float >()

template<>
__device__ void ck::inner_product< float2_t, float2_t, float > ( const float2_t & a,
const float2_t & b,
float & c )

◆ inner_product< float4_t, float4_t, float >()

template<>
__device__ void ck::inner_product< float4_t, float4_t, float > ( const float4_t & a,
const float4_t & b,
float & c )

◆ inner_product< half2_t, half2_t, float >()

template<>
__device__ void ck::inner_product< half2_t, half2_t, float > ( const half2_t & a,
const half2_t & b,
float & c )

◆ inner_product< half4_t, half4_t, float >()

template<>
__device__ void ck::inner_product< half4_t, half4_t, float > ( const half4_t & a,
const half4_t & b,
float & c )

◆ inner_product< half8_t, half8_t, float >()

template<>
__device__ void ck::inner_product< half8_t, half8_t, float > ( const half8_t & a,
const half8_t & b,
float & c )

◆ inner_product< half_t, half_t, float >()

template<>
__device__ void ck::inner_product< half_t, half_t, float > ( const half_t & a,
const half_t & b,
float & c )

◆ inner_product< int8_t, int8_t, int32_t >()

template<>
__device__ void ck::inner_product< int8_t, int8_t, int32_t > ( const int8_t & a,
const int8_t & b,
int32_t & c )

◆ inner_product< int8x16_t, int8x16_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x16_t, int8x16_t, int32_t > ( const int8x16_t & a,
const int8x16_t & b,
int32_t & c )

◆ inner_product< int8x2_t, int8x2_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x2_t, int8x2_t, int32_t > ( const int8x2_t & a,
const int8x2_t & b,
int32_t & c )

◆ inner_product< int8x4_t, int8x4_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x4_t, int8x4_t, int32_t > ( const int8x4_t & a,
const int8x4_t & b,
int32_t & c )

◆ inner_product< int8x8_t, int8x8_t, int32_t >()

template<>
__device__ void ck::inner_product< int8x8_t, int8x8_t, int32_t > ( const int8x8_t & a,
const int8x8_t & b,
int32_t & c )

◆ is_bf16_atomic_supported()

bool ck::is_bf16_atomic_supported ( )
inline

◆ is_gfx101_supported()

bool ck::is_gfx101_supported ( )
inline

◆ is_gfx103_supported()

bool ck::is_gfx103_supported ( )
inline

◆ is_gfx11_supported()

bool ck::is_gfx11_supported ( )
inline

◆ is_gfx12_supported()

bool ck::is_gfx12_supported ( )
inline

◆ is_lds_direct_load_supported()

bool ck::is_lds_direct_load_supported ( )
inline

◆ is_native_type()

template<typename T>
bool ck::is_native_type ( )
inline constexpr

◆ is_tf32_supported()

bool ck::is_tf32_supported ( )
inline

◆ is_wmma_supported()

bool ck::is_wmma_supported ( )
inline

◆ is_xdl_supported()

bool ck::is_xdl_supported ( )
inline

◆ is_xdl_wmma_supported()

template<typename ADataType, typename BDataType, index_t MPerXDL, index_t NPerXDL>
bool ck::is_xdl_wmma_supported ( )
inline

◆ IsNestedTuple()

template<typename... Ts>
__host__ __device__ constexpr auto ck::IsNestedTuple ( const Tuple< Ts... > & )
constexpr

◆ kernel_batched_elementwise()

template<typename GridwiseElementwiseFunctor, typename InGridDescTuple, typename OutGridDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename Block2TileMap, typename ElementwiseOperation, index_t NumInputs, index_t NumOutputs>
__global__ void ck::kernel_batched_elementwise ( const InGridDescTuple in_grid_desc_tuple,
const OutGridDescTuple out_grid_desc_tuple,
const InDataTypePointerTuple p_in_global_tuple,
const OutDataTypePointerTuple p_out_global_tuple,
const Block2TileMap block_2_tile_map,
const ElementwiseOperation elementwise_op,
const index_t batch_count,
const std::array< index_t, NumInputs > input_batch_strides,
const std::array< index_t, NumOutputs > output_batch_strides )

◆ kernel_batched_gemm_b_scale_xdl_cshuffle_v3()

template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_b_scale_xdl_cshuffle_v3 ( BatchedGemmArg karg)

◆ kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds()

template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_b_scale_xdl_cshuffle_v3_2lds ( BatchedGemmArg karg)

◆ kernel_batched_gemm_xdl_cshuffle_v3_multi_d()

template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_xdl_cshuffle_v3_multi_d ( BatchedGemmArg karg)

◆ kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds()

template<typename GridwiseGemm, typename BatchedGemmArg, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_batched_gemm_xdl_cshuffle_v3_multi_d_2lds ( BatchedGemmArg karg)

◆ kernel_batchnorm_backward_with_blockwise_welford()

template<typename GridwiseBatchrNormBackwardWithBlockwiseWelford_, typename XDataType, typename DyDataType, typename DxDataType, typename AccDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor>
__global__ void ck::kernel_batchnorm_backward_with_blockwise_welford ( const XYGridDesc_M_K x_grid_desc_m_k,
const XYGridDesc_M_K dy_grid_desc_m_k,
const XYGridDesc_M_K dx_grid_desc_m_k,
const ScaleBiasGridDesc_M scale_grid_desc_m,
const ScaleBiasGridDesc_M dscale_dbias_grid_desc_m,
const MeanVarGridDesc_M mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
long_index_t reduce_size,
index_t num_k_block_tile_iteration,
AccDataType epsilon,
const XDataType *const __restrict__ p_x,
const DyDataType *const __restrict__ p_dy,
const ScaleDataType *const __restrict__ p_scale,
bool haveSavedMeanInvVar,
const MeanVarDataType *const __restrict__ p_savedMean,
const MeanVarDataType *const __restrict__ p_savedInvVar,
const DyElementwiseOp dy_elementwise_op,
DxDataType *const __restrict__ p_dx,
DscaleDbiasDataType *const __restrict__ p_dscale,
DscaleDbiasDataType *const __restrict__ p_dbias )

◆ kernel_batchnorm_forward_with_blockwise_welford()

template<typename GridwiseBatchrNormForwardWithBlockwiseWelford_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor>
__global__ void ck::kernel_batchnorm_forward_with_blockwise_welford ( const XYGridDesc_M_K x_grid_desc_m_k,
const XYGridDesc_M_K y_grid_desc_m_k,
const ScaleBiasGridDesc_M scale_grid_desc_m,
const ScaleBiasGridDesc_M bias_grid_desc_m,
const MeanVarGridDesc_M mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
index_t num_k_block_tile_iteration,
AccDataType epsilon,
const XDataType *const __restrict__ p_x,
const ScaleDataType *const __restrict__ p_scale,
const BiasDataType *const __restrict__ p_bias,
const YElementwiseOp y_elementwise_op,
YDataType *const __restrict__ p_y,
bool updateMovingAverage,
AccDataType averageFactor,
MeanVarDataType *const __restrict__ resultRunningMean,
MeanVarDataType *const __restrict__ resultRunningVariance,
bool saveMeanInvVariance,
MeanVarDataType *const __restrict__ resultSaveMean,
MeanVarDataType *const __restrict__ resultSaveInvVariance )

◆ kernel_buffer_set_value()

template<index_t BlockSize, typename DataType, typename Grid1dBufferDescType>
__global__ void ck::kernel_buffer_set_value ( const Grid1dBufferDescType grid_1d_buffer_desc,
DataType *const __restrict__ p_global,
DataType value )

◆ kernel_contraction_multiple_abd_xdl_cshuffle()

template<typename GridwiseGemm, typename AsPointer, typename BsPointer, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AsGridDesc_AK0_M_AK1, typename BsGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_abd_xdl_cshuffle ( AsPointer p_as_grid,
BsPointer p_bs_grid,
DsPointer p_ds_grid,
EDataType *__restrict__ p_e_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AsGridDesc_AK0_M_AK1 as_grid_desc_ak0_m_ak1,
const BsGridDesc_BK0_N_BK1 bs_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap block_2_etile_map )

◆ kernel_contraction_multiple_d_wmma_cshuffle()

template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AGridDesc, typename BGridDesc, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename ComputePtrOffsetOfBatch, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_wmma_cshuffle ( const ADataType *__restrict__ p_a_grid,
const BDataType *__restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType *__restrict__ p_e_grid,
const index_t batch_count,
const AGridDesc a_grid_desc,
const BGridDesc b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const Block2CTileMap block_2_etile_map )

◆ kernel_contraction_multiple_d_xdl_cshuffle() [1/3]

template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatDsPointer p_ds_grid,
FloatE *__restrict__ p_e_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap block_2_etile_map )

◆ kernel_contraction_multiple_d_xdl_cshuffle() [2/3]

template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ComputePtrOffsetOfBatch, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatDsPointer p_ds_grid,
FloatE *__restrict__ p_e_grid,
const index_t batch_count,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const Block2ETileMap block_2_etile_map )

◆ kernel_contraction_multiple_d_xdl_cshuffle() [3/3]

template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AKB_AK0_M_AK1, typename BGridDesc_BKB_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ComputePtrOffsetOfBatch, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_contraction_multiple_d_xdl_cshuffle ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatDsPointer p_ds_grid,
FloatE *__restrict__ p_e_grid,
const index_t batch_count,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1,
const BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
const Block2ETileMap block_2_etile_map )

◆ kernel_elementwise()

template<typename GridwiseElementwiseFunctor, typename InGridDescTuple, typename OutGridDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename Block2TileMap, typename ElementwiseOperation>
__global__ void ck::kernel_elementwise ( const InGridDescTuple in_grid_desc_tuple,
const OutGridDescTuple out_grid_desc_tuple,
const InDataTypePointerTuple p_in_global_tuple,
const OutDataTypePointerTuple p_out_global_tuple,
const Block2TileMap block_2_tile_map,
const ElementwiseOperation elementwise_op )

◆ kernel_elementwise_1d()

template<typename GridwiseElementwise1dFunctor, typename InGrid1dDescTuple, typename OutGrid1dDescTuple, typename InDataTypePointerTuple, typename OutDataTypePointerTuple, typename ElementwiseOperation, typename UnaryOperation, typename Scale>
__global__ void ck::kernel_elementwise_1d ( const InGrid1dDescTuple in_grid_1d_desc_tuple,
const OutGrid1dDescTuple out_grid_1d_desc_tuple,
const InDataTypePointerTuple p_in_global_tuple,
const OutDataTypePointerTuple p_out_global_tuple,
const ElementwiseOperation elementwise_op,
const UnaryOperation unary_op,
const Scale scale_op )

◆ kernel_elementwise_batched_dual()

template<typename GridwiseElementwiseFunctorA, typename GridwiseElementwiseFunctorB, typename InAGridDescTuple, typename InBGridDescTuple, typename OutAGridDescTuple, typename OutBGridDescTuple, typename InADataTypePointerTuple, typename InBDataTypePointerTuple, typename OutADataTypePointerTuple, typename OutBDataTypePointerTuple, typename Block2TileMapA, typename Block2TileMapB, typename ElementwiseOperation, index_t NumInputsA, index_t NumInputsB, index_t NumOutputsA, index_t NumOutputsB>
__global__ void ck::kernel_elementwise_batched_dual ( const InAGridDescTuple in_grid_desc_tuple_a,
const InBGridDescTuple in_grid_desc_tuple_b,
const OutAGridDescTuple out_grid_desc_tuple_a,
const OutBGridDescTuple out_grid_desc_tuple_b,
const InADataTypePointerTuple p_in_global_tuple_a,
const InBDataTypePointerTuple p_in_global_tuple_b,
const OutADataTypePointerTuple p_out_global_tuple_a,
const OutBDataTypePointerTuple p_out_global_tuple_b,
const Block2TileMapA block_2_tile_map_a,
const Block2TileMapB block_2_tile_map_b,
const ElementwiseOperation elementwise_op,
const index_t a_grid_size,
const index_t batch_count_a,
const index_t batch_count_b,
const std::array< index_t, NumInputsA > input_batch_strides_a,
const std::array< index_t, NumInputsB > input_batch_strides_b,
const std::array< index_t, NumOutputsA > output_batch_strides_a,
const std::array< index_t, NumOutputsB > output_batch_strides_b )

◆ kernel_elementwise_dual()

template<typename GridwiseElementwiseFunctorA, typename GridwiseElementwiseFunctorB, typename InAGridDescTuple, typename InBGridDescTuple, typename OutAGridDescTuple, typename OutBGridDescTuple, typename InADataTypePointerTuple, typename InBDataTypePointerTuple, typename OutADataTypePointerTuple, typename OutBDataTypePointerTuple, typename Block2TileMapA, typename Block2TileMapB, typename ElementwiseOperation>
__global__ void ck::kernel_elementwise_dual ( const InAGridDescTuple in_grid_desc_tuple_a,
const InBGridDescTuple in_grid_desc_tuple_b,
const OutAGridDescTuple out_grid_desc_tuple_a,
const OutBGridDescTuple out_grid_desc_tuple_b,
const InADataTypePointerTuple p_in_global_tuple_a,
const InBDataTypePointerTuple p_in_global_tuple_b,
const OutADataTypePointerTuple p_out_global_tuple_a,
const OutBDataTypePointerTuple p_out_global_tuple_b,
const Block2TileMapA block_2_tile_map_a,
const Block2TileMapB block_2_tile_map_b,
const ElementwiseOperation elementwise_op,
const index_t a_grid_size )

◆ kernel_elementwise_layernorm()

template<typename GridwiseElementwiseReduction, typename InDataTypePointerTuple, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename AccDataType, typename XElementwiseOperation, typename YElementwiseOperation, typename InGrid2dDescTuple, typename GridDesc_M_K>
__global__ void ck::kernel_elementwise_layernorm ( const InGrid2dDescTuple in_grid_2d_desc_tuple,
const GridDesc_M_K x_grid_desc_m_k,
const GridDesc_M_K gamma_grid_desc_m_k,
const GridDesc_M_K beta_grid_desc_m_k,
const GridDesc_M_K y_grid_desc_m_k,
index_t num_k_block_tile_iteration,
AccDataType epsilon,
const InDataTypePointerTuple p_in_global_tuple,
const GammaDataType *const __restrict__ p_gamma_global,
const BetaDataType *const __restrict__ p_beta_global,
YDataType *const __restrict__ p_y_global,
const XElementwiseOperation x_elementwise_op,
const YElementwiseOperation y_elementwise_op )

◆ kernel_fpAintB_gemm_wmma()

template<typename GridwiseGemm, typename ADataType, typename BDataType, typename ScaleDataType, typename CDataType, typename AGridDesc, typename BGridDesc, typename ScaleGridDesc, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_fpAintB_gemm_wmma ( const ADataType *__restrict__ p_a_grid,
const BDataType *__restrict__ p_b_grid,
const ScaleDataType *__restrict__ p_scale_grid,
CDataType *__restrict__ p_c_grid,
const AGridDesc a_grid_desc,
const BGridDesc b_grid_desc,
const ScaleGridDesc scale_grid_desc,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_bias_add_reduce_xdl_cshuffle_v1()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename FloatC0, typename FloatC1, typename ReducePtrsGlobal, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename C1ElementwiseOperation, typename ReduceInElementwiseOperations, typename ReduceAccElementwiseOperations, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ReduceGridDescriptor_MBlock_MPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_bias_add_reduce_xdl_cshuffle_v1 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const FloatC0 *__restrict__ p_bias_grid,
const FloatC1 *__restrict__ p_d0_grid,
ReducePtrsGlobal p_reduces_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const C1ElementwiseOperation c1_element_op,
const ReduceInElementwiseOperations reduce_in_element_ops,
const ReduceAccElementwiseOperations reduce_out_element_ops,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock,
const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c0_grid_desc_mblock_mperblock_nblock_nperblock,
const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c1_grid_desc_mblock_mperblock_nblock_nperblock,
const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_dl_multiple_d()

template<typename GridwiseGemm, typename ABDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_K0_M0_M1_K1, typename BGridDesc_K0_N0_N1_K1, typename DsGridDesc_M0_M10_M11_N0_N10_N11, typename CGridDesc_M0_M10_M11_N0_N10_N11, typename Block2CTileMap, bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void ck::kernel_gemm_dl_multiple_d ( const ABDataType *__restrict__ p_a_grid,
const ABDataType *__restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType *__restrict__ p_e_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1,
const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1,
const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11,
const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_dl_v1r3()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M0_M1_K1, typename BGridDesc_K0_N0_N1_K1, typename CGridDesc_M0_M10_M11_N0_N10_N11, typename Block2CTileMap, bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
__global__ void ck::kernel_gemm_dl_v1r3 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1,
const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1,
const CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_dpp()

template<typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_dpp ( const typename GridwiseGemm::Argument karg)

◆ kernel_gemm_layernorm_xdl_cshuffle_v1()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename FloatC0, typename AElementwiseOperation, typename BElementwiseOperation, typename AccElementwiseOperation, typename CElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename C0GridDescriptor_NBlock_NPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_layernorm_xdl_cshuffle_v1 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const FloatC0 *__restrict__ p_c0_bias_grid,
const FloatC0 *__restrict__ p_c0_add_grid,
const FloatC0 *__restrict__ p_c0_gamma_grid,
const FloatC0 *__restrict__ p_c0_beta_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const AccElementwiseOperation acc_element_op,
const CElementwiseOperation c_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock,
const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_multiple_d_multiple_r_xdl_cshuffle()

template<typename GridwiseGemm, typename FloatAB, typename FloatDsPointer, typename FloatE, typename FloatRsPointer, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename QsElementwiseOperation, typename RsElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename RsGridDescriptor_MBlock_MPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_multiple_r_xdl_cshuffle ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatDsPointer p_ds_grid,
FloatE *__restrict__ p_e_grid,
FloatRsPointer p_rs_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const QsElementwiseOperation qs_element_op,
const RsElementwiseOperation rs_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock,
const Block2ETileMap block_2_etile_map )

◆ kernel_gemm_multiple_d_welford_first_half_wmma_cshuffle_v3()

template<typename GridwiseGemm, typename EMeanVarDataType, bool HasMainKBlockLoop, InMemoryDataOperationEnum EGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_multiple_d_welford_first_half_wmma_cshuffle_v3 ( typename GridwiseGemm::Argument karg,
EMeanVarDataType *__restrict__ p_welford_mean_grid,
EMeanVarDataType *__restrict__ p_welford_var_grid,
int32_t *__restrict__ p_welford_count_grid )

◆ kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle()

template<typename GridwiseGemmWelford, typename ABDataType, typename DsPointer, typename EMeanVarDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename MeanVarGridDescriptor_MBlock_MPerBlock_NBlock, typename CountGridDescriptor_MBlock_MPerBlock_NBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_welford_first_half_xdl_cshuffle ( const ABDataType *__restrict__ p_a_grid,
const ABDataType *__restrict__ p_b_grid,
DsPointer p_ds_grid,
EMeanVarDataType *__restrict__ p_e_grid,
EMeanVarDataType *__restrict__ p_welford_mean_grid,
EMeanVarDataType *__restrict__ p_welford_var_grid,
int32_t *__restrict__ p_welford_count_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const MeanVarGridDescriptor_MBlock_MPerBlock_NBlock mean_var_grid_desc_mblock_mperblock_nblock,
const CountGridDescriptor_MBlock_MPerBlock_NBlock count_grid_desc_mblock_mperblock_nblock,
const Block2ETileMap block_2_etile_map,
index_t NRaw )

◆ kernel_gemm_multiple_d_xdl_cshuffle()

template<typename GridwiseGemm, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_xdl_cshuffle ( const ADataType *__restrict__ p_a_grid,
const BDataType *__restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType *__restrict__ p_e_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap block_2_etile_map )

◆ kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load()

template<typename GridwiseGemm, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_multiple_d_xdl_cshuffle_lds_direct_load ( const ADataType *__restrict__ p_a_grid,
const BDataType *__restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType *__restrict__ p_e_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap block_2_etile_map )

◆ kernel_gemm_mupltipe_d_wmma_cshuffle()

template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AGridDesc, typename BGridDesc, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_mupltipe_d_wmma_cshuffle ( const ADataType *__restrict__ p_a_grid,
const BDataType *__restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType *__restrict__ p_e_grid,
const AGridDesc a_grid_desc,
const BGridDesc b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_reduce_xdl_cshuffle_v1()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename ReducePtrsGlobal, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename ReduceInElementwiseOperations, typename ReduceAccElementwiseOperations, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename ReduceGridDescriptor_MBlock_MPerBlock, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_reduce_xdl_cshuffle_v1 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
ReducePtrsGlobal p_reduces_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const ReduceInElementwiseOperations reduce_in_element_ops,
const ReduceAccElementwiseOperations reduce_out_element_ops,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock,
const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_wmma()

template<typename GridwiseGemm, typename ADataType, typename BDataType, typename CDataType, typename AGridDesc, typename BGridDesc, typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_wmma ( const ADataType *__restrict__ p_a_grid,
const BDataType *__restrict__ p_b_grid,
CDataType *__restrict__ p_c_grid,
const AGridDesc a_grid_desc,
const BGridDesc b_grid_desc,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_wmma_cshuffle_v3()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum EGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_wmma_cshuffle_v3 ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v1() [1/2]

template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v1 ( const FloatA *__restrict__ p_a_grid,
const FloatB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
typename GridwiseGemm::Problem problem )

◆ kernel_gemm_xdl_cshuffle_v1() [2/2]

template<typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v1 ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v2() [1/2]

template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_cshuffle_v2 ( const FloatA * p_a_grid,
const FloatB * p_b_grid,
FloatC * p_c_grid,
typename GridwiseGemm::Problem problem )

◆ kernel_gemm_xdl_cshuffle_v2() [2/2]

template<typename GridwiseGemm, bool HasMainKBlockLoop, index_t TailNum = 3>
__global__ void ck::kernel_gemm_xdl_cshuffle_v2 ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3 ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_2lds()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_2lds ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_b_preshuffle()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_b_preshuffle ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_b_preshuffle_2lds ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_2lds()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_2lds ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_gemm_xdl_cshuffle_v3_multi_d_blockscale_b_preshuffle_2lds ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_mx() [1/2]

template<bool Use2LDS, typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t< Use2LDS, void > ck::kernel_gemm_xdl_cshuffle_v3_mx ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_cshuffle_v3_mx() [2/2]

template<bool Use2LDS, typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Full>
__global__ enable_if_t<!Use2LDS, void > ck::kernel_gemm_xdl_cshuffle_v3_mx ( typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdl_waveletmodel_cshuffle()

template<typename GridwiseGemm, typename ABDataType, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename EElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2ETileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdl_waveletmodel_cshuffle ( const ABDataType *__restrict__ p_a_grid,
const ABDataType *__restrict__ p_b_grid,
EDataType *__restrict__ p_e_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const EElementwiseOperation e_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock,
const Block2ETileMap block_2_etile_map )

◆ kernel_gemm_xdlops_bwd_weight()

template<typename GridwiseGemm, typename FloatA, typename FloatB, typename FloatC, typename AGridDesc_B_K0_M_K1, typename BGridDesc_B_K0_N_K1, typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename CBlockClusterAdaptor, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_bwd_weight ( const FloatA *__restrict__ p_a_grid,
const FloatB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc,
const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc,
const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const CBlockClusterAdaptor c_block_cluster_adaptor )

◆ kernel_gemm_xdlops_skip_b_lds_v1()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainK0BlockLoop>
__global__ void ck::kernel_gemm_xdlops_skip_b_lds_v1 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
const CGridDesc_M_N c_grid_desc_m_n,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_xdlops_splitk_lds_direct_load()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation>
__global__ void ck::kernel_gemm_xdlops_splitk_lds_direct_load ( typename GridwiseGemm::Argument karg,
const Block2CTileMap & b2c_map,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op )

◆ kernel_gemm_xdlops_streamk()

template<typename GridwiseGemm>
__global__ void ck::kernel_gemm_xdlops_streamk ( const typename GridwiseGemm::FloatAB * p_a_grid,
const typename GridwiseGemm::FloatAB * p_b_grid,
typename GridwiseGemm::FloatC * p_c_grid,
void * p_workspace,
index_t M,
index_t N,
index_t K,
index_t StrideA,
index_t StrideB,
index_t StrideC,
typename GridwiseGemm::Block2CTileMap block_mapping )

◆ kernel_gemm_xdlops_v2r3() [1/2]

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDesc_M_N, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r3 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
const CGridDesc_M_N c_grid_desc_m_n )

◆ kernel_gemm_xdlops_v2r3() [2/2]

template<typename GridwiseGemm, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r3 ( const typename GridwiseGemm::Argument karg)

◆ kernel_gemm_xdlops_v2r4()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename ABK0MK1GridDesc, typename BBK0NK1GridDesc, typename CM0N0M1N1M2M3M4N2GridDesc, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename CBlockClusterAdaptor, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v2r4 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const ABK0MK1GridDesc a_b_k0_m_k1_grid_desc,
const BBK0NK1GridDesc b_b_k0_n_k1_grid_desc,
const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const CBlockClusterAdaptor c_block_cluster_adaptor )

◆ kernel_gemm_xdlops_v2r4r2_simplified()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, typename Block2CTileMap, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation>
__global__ void ck::kernel_gemm_xdlops_v2r4r2_simplified ( typename GridwiseGemm::Argument karg,
const Block2CTileMap & b2c_map,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op )

◆ kernel_gemm_xdlops_v3r1()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainK0BlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r1 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_xdlops_v3r2()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r2 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const FloatC *__restrict__ p_c0_grid,
const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map )

◆ kernel_gemm_xdlops_v3r3()

template<typename GridwiseGemm, typename FloatAB, typename FloatC, typename AGridDesc_K0_M_K1, typename BGridDesc_K0_N_K1, typename CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, typename AElementwiseOperation, typename BElementwiseOperation, typename CElementwiseOperation, typename Block2CTileMap, bool HasMainKBlockLoop>
__global__ void ck::kernel_gemm_xdlops_v3r3 ( const FloatAB *__restrict__ p_a_grid,
const FloatAB *__restrict__ p_b_grid,
FloatC *__restrict__ p_c_grid,
const FloatC *__restrict__ p_c0_grid,
const FloatC *__restrict__ p_c1_grid,
const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1,
const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1,
const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
const Block2CTileMap block_2_ctile_map )

◆ kernel_grouped_contraction_multiple_d_xdl_cshuffle()

template<typename GridwiseGemm, typename ContractionMultiDKernelArg, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, bool HasMainKBlockLoop>
__global__ void ck::kernel_grouped_contraction_multiple_d_xdl_cshuffle ( const void CK_CONSTANT_ADDRESS_SPACE * contraction_args,
const index_t group_count,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op )

◆ kernel_grouped_conv_multiple_d_wmma_cshuffle()

template<typename GridwiseOp, typename ADataType, typename BDataType, typename DsPointer, typename EDataType, typename AElementwiseOperation, typename BElementwiseOperation, typename CDEElementwiseOperation, typename AGridDesc_AK0_M_AK1, typename BGridDesc_BK0_N_BK1, typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, typename Block2CTileMap, typename ComputePtrOffsetOfBatch, bool HasMainKBlockLoop>
__global__ void ck::kernel_grouped_conv_multiple_d_wmma_cshuffle ( const ADataType *__restrict__ p_a_grid,
const BDataType *__restrict__ p_b_grid,
DsPointer p_ds_grid,
EDataType *__restrict__ p_e_grid,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CDEElementwiseOperation cde_element_op,
const index_t batch_count,
const AGridDesc_AK0_M_AK1 a_grid_desc,
const BGridDesc_BK0_N_BK1 b_grid_desc,
const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock,
const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_,
const Block2CTileMap block_2_ctile_map,
const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch )

◆ kernel_moe_gemm()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_gemm ( typename GridwiseGemm::Argument karg)

◆ kernel_moe_gemm_2lds()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_gemm_2lds ( typename GridwiseGemm::Argument karg)

◆ kernel_moe_mxgemm()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_mxgemm ( typename GridwiseGemm::Argument karg)

◆ kernel_moe_mxgemm_2lds()

template<typename GridwiseGemm, bool HasMainKBlockLoop, InMemoryDataOperationEnum CGlobalMemoryDataOperation, index_t MinimumOccupancy = 1, TailNumber TailNum = TailNumber::Even>
__global__ void ck::kernel_moe_mxgemm_2lds ( typename GridwiseGemm::Argument karg)

◆ kernel_multiblock_batchnorm_forward()

template<typename GridwiseMultiblockBatchNormForward_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename MeanVarCountGridDesc_M_G, typename MeanVarCountGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M, typename GetReduceCountPerThreadFunctor>
__global__ void ck::kernel_multiblock_batchnorm_forward ( const XYGridDesc_M_K x_grid_desc_m_k,
const XYGridDesc_M_K y_grid_desc_m_k,
const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g,
const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k,
const ScaleBiasGridDesc_M scale_grid_desc_m,
const ScaleBiasGridDesc_M bias_grid_desc_m,
const MeanVarGridDesc_M mean_var_grid_desc_m,
const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
index_t num_k_block_tile_iteration,
AccDataType epsilon,
const XDataType *const __restrict__ p_x,
MeanVarDataType *const __restrict__ p_welford_mean,
MeanVarDataType *const __restrict__ p_welford_variance,
int32_t *const __restrict__ p_welford_count,
int32_t *const __restrict__ p_control,
const ScaleDataType *const __restrict__ p_scale,
const BiasDataType *const __restrict__ p_bias,
const YElementwiseOp y_elementwise_op,
YDataType *const __restrict__ p_y,
bool updateMovingAverage,
AccDataType averageFactor,
MeanVarDataType *const __restrict__ resultRunningMean,
MeanVarDataType *const __restrict__ resultRunningVariance,
bool saveMeanInvVariance,
MeanVarDataType *const __restrict__ resultSaveMean,
MeanVarDataType *const __restrict__ resultSaveInvVariance )

◆ kernel_multiblock_welford_first_half()

template<typename GridwiseMultiblockWelfordFirstHalf_, typename XDataType, typename MeanVarDataType, typename XGridDesc_M_K, typename MeanVarCountGridDesc_M_G, typename GetReduceCountPerThreadFunctor>
__global__ void ck::kernel_multiblock_welford_first_half ( const XGridDesc_M_K x_grid_desc_m_k,
const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g,
const GetReduceCountPerThreadFunctor get_reduce_count_per_thread,
index_t num_k_block_tile_iteration,
const XDataType *const __restrict__ p_x,
MeanVarDataType *const p_welford_mean,
MeanVarDataType *const p_welford_variance,
int32_t *const p_welford_count )

◆ kernel_multiple_buffer_set_value()

template<typename Grid1dBufferDescTuple, index_t NumBuffer, index_t BlockSize, typename DataTypePointerTuple, typename DataTypeTuple>
__global__ void ck::kernel_multiple_buffer_set_value ( const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple,
DataTypePointerTuple p_global_tuple,
DataTypeTuple value_tuple )

◆ kernel_multiple_reduce_multiblock()

template<typename GridwiseMultipleReduction, index_t NumReduction, typename InDataType, typename OutDataTypePointerTuple, typename AccDataType, typename InGridDesc_M_K, typename OutGridDesc_M_Tuple, typename InElementwiseOperationTuple, typename AccElementwiseOperationTuple>
__global__ void ck::kernel_multiple_reduce_multiblock ( const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M_Tuple out_grid_desc_m_tuple,
const InElementwiseOperationTuple in_elementwise_op_tuple,
const AccElementwiseOperationTuple acc_elementwise_op_tuple,
index_t block_group_size,
index_t num_k_block_tile_iteration,
Array< AccDataType, NumReduction > alpha_values,
const InDataType *const __restrict__ p_in_value_global,
Array< AccDataType, NumReduction > beta_values,
OutDataTypePointerTuple p_out_value_global_tuple )

◆ kernel_multiple_reduce_threadwise()

template<typename GridwiseMultipleReduction, index_t NumReduction, typename InDataType, typename OutDataTypePointerTuple, typename AccDataType, typename InGridDesc_M_K, typename OutGridDesc_M_Tuple, typename InElementwiseOperationTuple, typename AccElementwiseOperationTuple>
__global__ void ck::kernel_multiple_reduce_threadwise ( const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M_Tuple out_grid_desc_m_tuple,
const InElementwiseOperationTuple in_elementwise_op_tuple,
const AccElementwiseOperationTuple acc_elementwise_op_tuple,
Array< AccDataType, NumReduction > alpha_values,
const InDataType *const __restrict__ p_in_value_global,
Array< AccDataType, NumReduction > beta_values,
OutDataTypePointerTuple p_out_value_global_tuple )

◆ kernel_nd_permute()

template<typename GridwisePermute, typename InGridDesc, typename OutGridDesc, typename InDataType, typename OutDataType, typename ElementwiseOperation, typename Block2TileMap>
__global__ void ck::kernel_nd_permute ( const InGridDesc in_grid_desc,
const OutGridDesc out_grid_desc,
const InDataType * p_in_global,
OutDataType * p_out_global,
const ElementwiseOperation elementwise_op,
const Block2TileMap block_2_tile_map )

◆ kernel_normalization()

template<typename GridwiseReduction, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename GridDesc_M_K, typename GridDesc_M>
__global__ void ck::kernel_normalization ( const GridDesc_M_K x_grid_desc_m_k,
const GridDesc_M_K gamma_grid_desc_m_k,
const GridDesc_M_K beta_grid_desc_m_k,
const GridDesc_M_K y_grid_desc_m_k,
const GridDesc_M save_mean_grid_desc_m,
const GridDesc_M save_inv_std_grid_desc_m,
index_t num_k_block_tile_iteration,
ComputeDataType epsilon,
const XDataType *const __restrict__ p_x_global,
const GammaDataType *const __restrict__ p_gamma_global,
const BetaDataType *const __restrict__ p_beta_global,
YDataType *const __restrict__ p_y_global,
SaveMeanInvStdDataType *const __restrict__ p_save_mean_global,
SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global,
const YElementwiseOperation y_elementwise_op )

◆ kernel_normalizationSplitK1st()

template<typename GridwiseWelford, typename XDataType, typename WorkspaceMeanVarDataType, typename ComputeDataType, typename XGridDesc_M_K, typename MeanVarGridDesc_M_KBlock>
__global__ void ck::kernel_normalizationSplitK1st ( const XGridDesc_M_K x_grid_desc_m_k,
const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock,
index_t num_k_block_tile_iteration,
const XDataType *const __restrict__ p_x_global,
WorkspaceMeanVarDataType *const __restrict__ p_welford_mean,
WorkspaceMeanVarDataType *const __restrict__ p_welford_variance,
int32_t *const __restrict__ p_welford_count )

◆ kernel_normalizationSplitK2nd()

template<typename GridwiseWelfordNormalization, typename WorkspaceMeanVarDataType, typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename MeanVarGridDesc_M_KBlock, typename CountGridDesc_M_KBlock, typename XYGammaBetaGridDesc_M_K, typename SaveMeanInvStdGridDesc_M>
__global__ void ck::kernel_normalizationSplitK2nd ( const MeanVarGridDesc_M_KBlock mean_var_grid_desc_m_kblock,
const CountGridDesc_M_KBlock count_grid_desc_m_kblock,
const XYGammaBetaGridDesc_M_K x_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K gamma_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K beta_grid_desc_m_k,
const XYGammaBetaGridDesc_M_K y_grid_desc_m_k,
const SaveMeanInvStdGridDesc_M save_mean_grid_desc_m,
const SaveMeanInvStdGridDesc_M save_inv_std_grid_desc_m,
index_t num_k_mean_var_count_iteration,
index_t num_k_block_tile_iteration,
index_t k_grid_size,
ComputeDataType epsilon,
const WorkspaceMeanVarDataType *const p_mean_global,
const WorkspaceMeanVarDataType *const p_variance_global,
const int32_t *const p_welford_count_global,
const XDataType *const __restrict__ p_x_global,
const GammaDataType *const __restrict__ p_gamma_global,
const BetaDataType *const __restrict__ p_beta_global,
YDataType *const __restrict__ p_y_global,
SaveMeanInvStdDataType *const __restrict__ p_save_mean_global,
SaveMeanInvStdDataType *const __restrict__ p_save_inv_std_global,
const YElementwiseOperation y_elementwise_op )

◆ kernel_put_element_1d()

template<typename GridwisePutElementwise1dFunctor, typename InGrid1dDesc, typename InDataType, typename IndexDataType, typename OutDataType, typename ElementwiseOperation>
__global__ void ck::kernel_put_element_1d ( const InGrid1dDesc in_grid_1d_desc,
const InDataType *__restrict__ p_in_global,
const IndexDataType *__restrict__ p_indices_global,
OutDataType *__restrict__ p_out_global,
const ElementwiseOperation elementwise_op )

◆ kernel_reduce_multiblock()

template<typename GridwiseReduction, bool OutputIndex, bool HaveIndexInput, typename InDataType, typename OutDataType, typename AccDataType, typename IndexDataType, typename InGridDesc_M_K, typename OutGridDesc_M, typename InElementwiseOperation, typename AccElementwiseOperation>
__global__ void ck::kernel_reduce_multiblock ( const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op,
index_t block_group_size,
index_t num_k_block_tile_iteration,
AccDataType alpha,
const InDataType *const __restrict__ p_in_value_global,
const IndexDataType *const __restrict__ p_in_index_global,
AccDataType beta,
OutDataType *const __restrict__ p_out_value_global,
IndexDataType *const __restrict__ p_out_index_global )

◆ kernel_reduce_second_half_batchnorm_backward_final()

template<typename GridwiseReduceSecondHalfBatchNormBackwardFinal_, typename XDataType, typename DyDataType, typename DxDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename DscaleDbiasGridDesc_M_K, typename MeanVarGridDesc_M, typename ScaleBiasGridDesc_M>
__global__ void ck::kernel_reduce_second_half_batchnorm_backward_final ( const XYGridDesc_M_K x_grid_desc_m_k,
const XYGridDesc_M_K dy_grid_desc_m_k,
const XYGridDesc_M_K dx_grid_desc_m_k,
const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k,
const MeanVarGridDesc_M mean_var_grid_desc_m,
const ScaleBiasGridDesc_M scale_grid_desc_m,
const ScaleBiasGridDesc_M bias_grid_desc_m,
index_t blkgroup_size,
long_index_t reduce_size,
index_t num_xy_k_block_tile_iteration,
index_t num_dscale_dbias_k_block_tile_iteration,
const DscaleDbiasDataType *const __restrict__ p_reduce_dscale,
const DscaleDbiasDataType *const __restrict__ p_reduce_dbias,
const MeanVarDataType *const __restrict__ p_mean,
const MeanVarDataType *const __restrict__ p_inv_var,
const XDataType *const __restrict__ p_x,
const DyDataType *const __restrict__ p_dy,
const ScaleDataType *const __restrict__ p_scale,
const DyElementwiseOp dy_elementwise_op,
DxDataType *const __restrict__ p_dx,
DscaleDbiasDataType *const __restrict__ p_dscale,
DscaleDbiasDataType *const __restrict__ p_dbias )

◆ kernel_reduce_threadwise()

template<typename GridwiseReduction, bool OutputIndex, bool TransformIndexKtoGlobal, bool HaveIndexInput, typename InDataType, typename OutDataType, typename AccDataType, typename IndexDataType, typename InGridDesc_M_K, typename OutGridDesc_M, typename InElementwiseOperation, typename AccElementwiseOperation>
__global__ void ck::kernel_reduce_threadwise ( const InGridDesc_M_K in_grid_desc_m_k,
const OutGridDesc_M out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const AccElementwiseOperation acc_elementwise_op,
AccDataType alpha,
const InDataType *const __restrict__ p_in_value_global,
const IndexDataType *const __restrict__ p_in_index_global,
AccDataType beta,
OutDataType *const __restrict__ p_out_value_global,
IndexDataType *const __restrict__ p_out_index_global )

◆ kernel_reduce_threadwise_multi_d()

template<typename GridwiseReduction, typename InDataType, typename OutDataType, typename AccDataType, typename InGridDesc_M_K, typename DsGridDesc_M, typename OutGridDesc_M, typename InElementwiseOperation, typename OutElementwiseOperation, typename DsGridPointer>
__global__ void ck::kernel_reduce_threadwise_multi_d ( const InGridDesc_M_K in_grid_desc_m_k,
const DsGridDesc_M ds_grid_desc_m,
const OutGridDesc_M out_grid_desc_m,
const InElementwiseOperation in_elementwise_op,
const OutElementwiseOperation out_elementwise_op,
const InDataType *const __restrict__ p_in_value_global,
const DsGridPointer p_ds_value_global,
OutDataType *const __restrict__ p_out_value_global )

◆ kernel_softmax()

template<typename GridwiseReduction, typename InDataType, typename OutDataType, typename AccDataType, typename GridDesc_M_K>
__global__ void ck::kernel_softmax ( const GridDesc_M_K in_grid_desc_m_k,
const GridDesc_M_K out_grid_desc_m_k,
index_t block_group_size,
index_t num_k_block_tile_iteration,
AccDataType alpha,
const InDataType *const __restrict__ p_in_value_global,
AccDataType beta,
OutDataType *const __restrict__ p_out_value_global )

◆ kernel_sparse_embeddings_forward_layernorm()

template<typename GridwiseSparseEmbedding, typename EmbType, typename IndexType, typename GammaDataType, typename BetaDataType, typename AccDataType, typename OutType, typename OutGridDesc, typename EmbElementwiseOperation, ck::index_t NumEmbeddings>
__global__ void ck::kernel_sparse_embeddings_forward_layernorm ( OutType * p_out,
const ck::Array< EmbType *, NumEmbeddings > p_embs,
const ck::Array< IndexType *, NumEmbeddings > p_indexes,
const GammaDataType * p_gamma,
const BetaDataType * p_beta,
const OutGridDesc out_grid_desc,
const AccDataType epsilon,
const EmbElementwiseOperation emb_elementwise_op )

◆ kernel_tensor_rearrange()

template<typename InputGridDesc, typename InputDataType, typename OutputGridDesc, typename OutputDataType, typename Block2ETileMap, typename ComputePtrOffsetOfStridedBatch, typename GridwiseTensorRearrangeKernel>
__global__ void ck::kernel_tensor_rearrange ( const InputGridDesc in_grid_desc,
const InputDataType *__restrict__ p_in_global,
const OutputGridDesc out_grid_desc,
OutputDataType *__restrict__ p_out_global,
const index_t batch_count,
const Block2ETileMap block_2_tile_map,
const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch )

◆ kernel_welford_layernorm2d_second_half()

template<typename GridwiseWelfordLayernorm, typename EMeanVarDataType, typename HDataType, typename GammaDataType, typename BetaDataType, typename ComputeDataType, typename EHGridDesc_M_N, typename LayernormMeanVarGridDesc_M_NBlock, typename LayernormCountGridDesc_M_NBlock, typename GammaBetaGridDesc_N, typename HElementwiseOperation>
__global__ void ck::kernel_welford_layernorm2d_second_half ( const EMeanVarDataType *__restrict__ p_e_grid,
const EMeanVarDataType *__restrict__ p_in_welford_mean_grid,
const EMeanVarDataType *__restrict__ p_in_welford_var_grid,
const int32_t *__restrict__ p_in_welford_count_grid,
const GammaDataType *__restrict__ p_gamma_grid,
const BetaDataType *__restrict__ p_beta_grid,
HDataType *__restrict__ p_h_grid,
const EHGridDesc_M_N e_grid_desc_m_n,
const EHGridDesc_M_N h_grid_desc_m_n,
const LayernormMeanVarGridDesc_M_NBlock mean_var_grid_desc_m_nblock,
const LayernormCountGridDesc_M_NBlock count_grid_desc_m_nblock,
const GammaBetaGridDesc_N gamma_grid_desc_n,
const GammaBetaGridDesc_N beta_grid_desc_n,
index_t numMeanVarCountBlockTileIteration_N,
index_t NBlockClusterLength,
ComputeDataType epsilon,
HElementwiseOperation h_element_op )

◆ kernel_welford_second_half_batchnorm_forward_final()

template<typename GridwiseWelfordSecondHalfBatchNormForwardFinal_, typename XDataType, typename YDataType, typename AccDataType, typename ScaleDataType, typename BiasDataType, typename MeanVarDataType, typename YElementwiseOp, typename XYGridDesc_M_K, typename MeanVarCountGridDesc_M_K, typename ScaleBiasGridDesc_M, typename MeanVarGridDesc_M>
__global__ void ck::kernel_welford_second_half_batchnorm_forward_final ( const XYGridDesc_M_K x_grid_desc_m_k,
const XYGridDesc_M_K y_grid_desc_m_k,
const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k,
const ScaleBiasGridDesc_M scale_grid_desc_m,
const ScaleBiasGridDesc_M bias_grid_desc_m,
const MeanVarGridDesc_M mean_var_grid_desc_m,
index_t blkgroup_size,
index_t num_xy_k_block_tile_iteration,
AccDataType epsilon,
const MeanVarDataType *const __restrict__ p_in_welford_mean,
const MeanVarDataType *const __restrict__ p_in_welford_variance,
const int32_t *const __restrict__ p_in_welford_count,
const XDataType *const __restrict__ p_x,
const ScaleDataType *const __restrict__ p_scale,
const BiasDataType *const __restrict__ p_bias,
const YElementwiseOp y_elementwise_op,
YDataType *const __restrict__ p_y,
bool updateMovingAverage,
AccDataType averageFactor,
MeanVarDataType *const __restrict__ resultRunningMean,
MeanVarDataType *const __restrict__ resultRunningVariance,
bool saveMeanInvVariance,
MeanVarDataType *const __restrict__ resultSaveMean,
MeanVarDataType *const __restrict__ resultSaveInvVariance )

◆ kernel_welford_second_half_reduce_first_half()

template<typename GridwiseWelfordSecondHalfReduceFirstHalf_, typename XDataType, typename DyDataType, typename AccDataType, typename ScaleDataType, typename DscaleDbiasDataType, typename MeanVarDataType, typename DyElementwiseOp, typename XYGridDesc_M_K, typename MeanVarGridDesc_M, typename MeanVarCountGridDesc_M_K, typename DscaleDbiasGridDesc_M_G>
__global__ void ck::kernel_welford_second_half_reduce_first_half ( const XYGridDesc_M_K x_grid_desc_m_k,
const XYGridDesc_M_K dy_grid_desc_m_k,
const MeanVarGridDesc_M mean_var_grid_desc_m,
const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k,
const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g,
index_t blkgroup_size,
index_t num_xy_k_block_tile_iteration,
index_t num_mean_var_count_k_block_tile_iteration,
AccDataType epsilon,
bool haveSavedMeanInvVar,
const MeanVarDataType *const __restrict__ p_savedMean,
const MeanVarDataType *const __restrict__ p_savedInvVar,
const MeanVarDataType *const __restrict__ p_in_welford_mean,
const MeanVarDataType *const __restrict__ p_in_welford_variance,
const int32_t *const __restrict__ p_in_welford_count,
const DyElementwiseOp dy_elementwise_op,
MeanVarDataType *const __restrict__ p_out_welford_mean,
MeanVarDataType *const __restrict__ p_out_welford_inv_variance,
const XDataType *const __restrict__ p_x,
const DyDataType *const __restrict__ p_dy,
DscaleDbiasDataType *const __restrict__ p_reduce_dscale,
DscaleDbiasDataType *const __restrict__ p_reduce_dbias )

◆ llvm_amdgcn_raw_buffer_atomic_add_fp16x2()

__device__ half2_t ck::llvm_amdgcn_raw_buffer_atomic_add_fp16x2 ( half2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_atomic_add_fp32()

__device__ float ck::llvm_amdgcn_raw_buffer_atomic_add_fp32 ( float vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_atomic_add_i32()

__device__ int32_t ck::llvm_amdgcn_raw_buffer_atomic_add_i32 ( int32_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_atomic_max_fp64()

__device__ double ck::llvm_amdgcn_raw_buffer_atomic_max_fp64 ( double vdata,
int32x4_t rsrc,
int voffset,
int soffset,
int glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp16()

__device__ half_t ck::llvm_amdgcn_raw_buffer_load_fp16 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp16x2()

__device__ half2_t ck::llvm_amdgcn_raw_buffer_load_fp16x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp16x4()

__device__ half4_t ck::llvm_amdgcn_raw_buffer_load_fp16x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp32()

__device__ float ck::llvm_amdgcn_raw_buffer_load_fp32 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp32x2()

__device__ float2_t ck::llvm_amdgcn_raw_buffer_load_fp32x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_fp32x4()

__device__ float4_t ck::llvm_amdgcn_raw_buffer_load_fp32x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i16()

__device__ bhalf_t ck::llvm_amdgcn_raw_buffer_load_i16 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i16x2()

__device__ bhalf2_t ck::llvm_amdgcn_raw_buffer_load_i16x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i16x4()

__device__ bhalf4_t ck::llvm_amdgcn_raw_buffer_load_i16x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i32()

__device__ int32_t ck::llvm_amdgcn_raw_buffer_load_i32 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i32x2()

__device__ int32x2_t ck::llvm_amdgcn_raw_buffer_load_i32x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i32x4()

__device__ int32x4_t ck::llvm_amdgcn_raw_buffer_load_i32x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i8()

__device__ int8_t ck::llvm_amdgcn_raw_buffer_load_i8 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i8x2()

__device__ int8x2_t ck::llvm_amdgcn_raw_buffer_load_i8x2 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_i8x4()

__device__ int8x4_t ck::llvm_amdgcn_raw_buffer_load_i8x4 ( int32x4_t srsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_load_lds()

__device__ void ck::llvm_amdgcn_raw_buffer_load_lds ( int32x4_t rsrc,
uint32_t * lds_ptr,
index_t size,
index_t voffset,
index_t soffset,
index_t offset,
index_t aux )

◆ llvm_amdgcn_raw_buffer_store_fp16()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16 ( half_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp16x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16x2 ( half2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp16x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp16x4 ( half4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp32()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32 ( float vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp32x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32x2 ( float2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_fp32x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_fp32x4 ( float4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i16()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16 ( bhalf_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i16x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16x2 ( bhalf2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i16x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i16x4 ( bhalf4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i32()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32 ( int32_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i32x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32x2 ( int32x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i32x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i32x4 ( int32x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i8()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8 ( int8_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i8x2()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8x2 ( int8x2_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ llvm_amdgcn_raw_buffer_store_i8x4()

__device__ void ck::llvm_amdgcn_raw_buffer_store_i8x4 ( int8x4_t vdata,
int32x4_t rsrc,
index_t voffset,
index_t soffset,
index_t glc_slc )

◆ make_array() [1/2]

template<typename X>
__host__ __device__ constexpr auto ck::make_array ( )
constexpr

◆ make_array() [2/2]

template<typename X, typename... Xs>
__host__ __device__ constexpr auto ck::make_array ( X && x,
Xs &&... xs )
constexpr

◆ make_cluster_descriptor()

template<typename Lengths, typename ArrangeOrder = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type>
__host__ __device__ constexpr auto ck::make_cluster_descriptor ( const Lengths & lengths,
ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{} )
constexpr

◆ make_conv_bwd_data_out_transform()

__host__ __device__ constexpr auto ck::make_conv_bwd_data_out_transform ( index_t N,
index_t Ho,
index_t Wo,
index_t K,
index_t YDot,
index_t XDot,
index_t HTilde,
index_t WTilde,
index_t ConvDilationH,
index_t ConvDilationW,
index_t HTildeSlice,
index_t WTildeSlice,
index_t YDotSlice,
index_t XDotSlice,
index_t IHTildeSliceBegin,
index_t IWTildeSliceBegin,
index_t GcdStrideDilationH,
index_t GcdStrideDilationW,
index_t K0,
index_t K1,
index_t MPerBlock,
index_t GemmKPerBlock )
constexpr

◆ make_default_loop_scheduler()

LoopScheduler ck::make_default_loop_scheduler ( )
constexpr

◆ make_dynamic_buffer() [1/2]

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize>
__host__ __device__ constexpr auto ck::make_dynamic_buffer ( T * p,
ElementSpaceSize element_space_size )
constexpr

◆ make_dynamic_buffer() [2/2]

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize, typename X, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X > >::value, bool >::type = false>
__host__ __device__ constexpr auto ck::make_dynamic_buffer ( T * p,
ElementSpaceSize element_space_size,
X invalid_element_value )
constexpr

◆ make_embed_transform()

template<typename UpLengths, typename Coefficients, typename enable_if< UpLengths::Size()==Coefficients::Size(), bool >::type = false>
__host__ __device__ constexpr auto ck::make_embed_transform ( const UpLengths & up_lengths,
const Coefficients & coefficients )
constexpr

◆ make_freeze_transform()

template<typename LowerIndex>
__host__ __device__ constexpr auto ck::make_freeze_transform ( const LowerIndex & low_idx)
constexpr

◆ make_insert_transform()

template<typename UpperIndex>
__host__ __device__ constexpr auto ck::make_insert_transform ( const UpperIndex & up_idx)
constexpr

◆ make_left_pad_transform()

template<typename LowLength, typename LeftPadLength, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto ck::make_left_pad_transform ( const LowLength & low_length,
const LeftPadLength & left_pad,
integral_constant< bool, SkipIsValidCheck > = integral_constant<bool, false>{} )
constexpr

◆ make_long_dynamic_buffer()

template<AddressSpaceEnum BufferAddressSpace, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence, typename T, typename ElementSpaceSize>
__host__ __device__ constexpr auto ck::make_long_dynamic_buffer ( T * p,
ElementSpaceSize element_space_size )
constexpr

◆ make_merge_transform()

template<typename LowLengths>
__host__ __device__ constexpr auto ck::make_merge_transform ( const LowLengths & low_lengths)
constexpr

◆ make_merge_transform_v1_carry_check()

template<typename LowLengths>
__host__ __device__ constexpr auto ck::make_merge_transform_v1_carry_check ( const LowLengths & low_lengths)
constexpr

◆ make_merge_transform_v2_magic_division()

template<typename LowLengths>
__host__ __device__ constexpr auto ck::make_merge_transform_v2_magic_division ( const LowLengths & low_lengths)
constexpr

◆ make_merge_transform_v3_division_mod()

template<typename LowLengths>
__host__ __device__ constexpr auto ck::make_merge_transform_v3_division_mod ( const LowLengths & low_lengths)
constexpr

◆ make_merge_transform_v4_no_carry()

template<typename LowLengths>
__host__ __device__ constexpr auto ck::make_merge_transform_v4_no_carry ( const LowLengths & low_lengths)
constexpr

◆ make_modulo_transform()

template<typename Modulus, typename UpLength>
__host__ __device__ constexpr auto ck::make_modulo_transform ( const Modulus & modulus,
const UpLength & up_length )
constexpr

◆ make_multi_index()

template<typename... Xs>
__host__ __device__ constexpr auto ck::make_multi_index ( Xs &&... xs)
constexpr

◆ make_naive_tensor_descriptor()

template<typename... Lengths, typename... Strides, typename enable_if< sizeof...(Lengths)==sizeof...(Strides), bool >::type = false>
__host__ __device__ constexpr auto ck::make_naive_tensor_descriptor ( const Tuple< Lengths... > & lengths,
const Tuple< Strides... > & strides )
constexpr

◆ make_naive_tensor_descriptor_aligned()

template<typename... Lengths, typename Align>
__host__ __device__ constexpr auto ck::make_naive_tensor_descriptor_aligned ( const Tuple< Lengths... > & lengths,
Align align )
constexpr

◆ make_naive_tensor_descriptor_packed()

template<typename... Lengths>
__host__ __device__ constexpr auto ck::make_naive_tensor_descriptor_packed ( const Tuple< Lengths... > & lengths)
constexpr

◆ make_pad_transform()

template<typename LowLength, typename LeftPad, typename RightPad, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto ck::make_pad_transform ( const LowLength & low_length,
const LeftPad & left_pad,
const RightPad & right_pad,
integral_constant< bool, SkipIsValidCheck > = integral_constant<bool, false>{} )
constexpr

◆ make_pass_through_transform()

template<typename LowLength>
__host__ __device__ constexpr auto ck::make_pass_through_transform ( const LowLength & low_length)
constexpr

◆ make_right_pad_transform()

template<typename LowLength, typename RightPadLength, bool SkipIsValidCheck = false>
__host__ __device__ constexpr auto ck::make_right_pad_transform ( const LowLength & low_length,
const RightPadLength & right_pad,
integral_constant< bool, SkipIsValidCheck > = integral_constant<bool, false>{} )
constexpr

◆ make_sequence()

template<index_t... Is>
__host__ __device__ constexpr auto ck::make_sequence ( Number< Is > ...)
constexpr

◆ make_single_stage_tensor_adaptor()

template<typename Transforms, typename LowerDimensionOldTopIdss, typename UpperDimensionNewTopIdss>
__host__ __device__ constexpr auto ck::make_single_stage_tensor_adaptor ( const Transforms & transforms,
LowerDimensionOldTopIdss ,
UpperDimensionNewTopIdss  )
constexpr

◆ make_slice_transform()

template<typename LowLength, typename SliceBegin, typename SliceEnd>
__host__ __device__ constexpr auto ck::make_slice_transform ( const LowLength & low_length,
const SliceBegin & slice_begin,
const SliceEnd & slice_end )
constexpr

◆ make_static_buffer() [1/2]

template<AddressSpaceEnum AddressSpace, typename T, long_index_t N>
__host__ __device__ constexpr auto ck::make_static_buffer ( LongNumber< N > )
constexpr

◆ make_static_buffer() [2/2]

template<AddressSpaceEnum AddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto ck::make_static_buffer ( Number< N > )
constexpr

◆ make_static_tensor() [1/2]

template<AddressSpaceEnum AddressSpace, typename T, typename TensorDesc, typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false>
__host__ __device__ constexpr auto ck::make_static_tensor ( TensorDesc )
constexpr

◆ make_static_tensor() [2/2]

template<AddressSpaceEnum AddressSpace, typename T, typename TensorDesc, typename X, typename enable_if< TensorDesc::IsKnownAtCompileTime(), bool >::type = false, typename enable_if< is_same< remove_cvref_t< T >, remove_cvref_t< X > >::value, bool >::type = false>
__host__ __device__ constexpr auto ck::make_static_tensor ( TensorDesc ,
X invalid_element_value )
constexpr

◆ make_statically_indexed_array() [1/2]

template<typename X>
__host__ __device__ constexpr auto ck::make_statically_indexed_array ( )
constexpr

◆ make_statically_indexed_array() [2/2]

template<typename X, typename... Xs>
__host__ __device__ constexpr auto ck::make_statically_indexed_array ( const X & x,
const Xs &... xs )
constexpr

◆ make_tensor_coordinate()

template<typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto ck::make_tensor_coordinate ( const TensorDesc & tensor_desc,
const VisibleIndex & idx_visible )
constexpr

◆ make_tensor_coordinate_step() [1/2]

template<typename TensorDesc, typename VisibleIndex>
__host__ __device__ constexpr auto ck::make_tensor_coordinate_step ( const TensorDesc & ,
const VisibleIndex & idx_diff_visible )
constexpr

◆ make_tensor_coordinate_step() [2/2]

template<typename TensorDesc, typename VisibleIndex, typename UpdateLowerIndexHack>
__host__ __device__ constexpr auto ck::make_tensor_coordinate_step ( const TensorDesc & ,
const VisibleIndex & idx_diff_visible,
UpdateLowerIndexHack  )
constexpr

◆ make_tuple()

template<typename... Xs>
__host__ __device__ constexpr auto ck::make_tuple ( Xs &&... xs)
constexpr

◆ make_unmerge_transform()

template<typename UpLengths, bool Use24BitIntegerCalculation = false>
__host__ __device__ constexpr auto ck::make_unmerge_transform ( const UpLengths & up_lengths,
integral_constant< bool, Use24BitIntegerCalculation > = integral_constant<bool, false>{} )
constexpr

◆ make_vector_type()

template<typename T, index_t N>
__host__ __device__ constexpr auto ck::make_vector_type ( Number< N > )
constexpr

◆ make_vectorize_transform()

template<typename VectorSize, typename UpLength>
__host__ __device__ constexpr auto ck::make_vectorize_transform ( const VectorSize & vector_size,
const UpLength & up_length )
constexpr

◆ make_wave_buffer_resource()

template<typename T>
__device__ int32x4_t ck::make_wave_buffer_resource ( T * p_wave,
index_t element_space_size )

◆ make_wave_buffer_resource_new()

template<typename T>
__device__ __amdgpu_buffer_rsrc_t ck::make_wave_buffer_resource_new ( T * p_wave,
index_t element_space_size )

◆ make_wave_buffer_resource_with_default_range()

template<typename T>
__device__ int32x4_t ck::make_wave_buffer_resource_with_default_range ( T * p_wave)

◆ make_wave_buffer_resource_with_default_range_new()

template<typename T>
__device__ __amdgpu_buffer_rsrc_t ck::make_wave_buffer_resource_with_default_range_new ( T * p_wave)

◆ make_xor_transform()

template<typename LowLengths>
__host__ __device__ constexpr auto ck::make_xor_transform ( const LowLengths & low_lengths)
constexpr

◆ make_xor_with_modulo_transform()

template<typename LowLengths>
__host__ __device__ constexpr auto ck::make_xor_with_modulo_transform ( const LowLengths & low_lengths)
constexpr

◆ make_zero_multi_index()

template<index_t NSize>
__host__ __device__ constexpr auto ck::make_zero_multi_index ( )
constexpr

◆ merge_sequences()

template<typename... Seqs>
__host__ __device__ constexpr auto ck::merge_sequences ( Seqs... )
constexpr

◆ modify_sequence_elements_by_ids()

template<typename Seq, typename Values, typename Ids>
__host__ __device__ constexpr auto ck::modify_sequence_elements_by_ids ( Seq ,
Values ,
Ids  )
constexpr

◆ move_tensor_coordinate()

template<typename TensorDesc, typename TensorCoord, typename TensorCoordStep>
__host__ __device__ constexpr void ck::move_tensor_coordinate ( const TensorDesc & tensor_desc,
TensorCoord & coord,
const TensorCoordStep & coord_step )
constexpr

◆ mxf8_convert_rne()

template<typename Y, typename X>
__host__ __device__ constexpr Y ck::mxf8_convert_rne ( X x,
float scale )
constexpr

◆ mxf8_convert_rne< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::mxf8_convert_rne< bf8_ocp_t, float > ( float x,
float scale )
inline

◆ mxf8_convert_rne< bf8x16_ocp_t, float16_t >()

template<>
__host__ __device__ bf8x16_ocp_t ck::mxf8_convert_rne< bf8x16_ocp_t, float16_t > ( float16_t x,
float scale )
inline

◆ mxf8_convert_rne< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::mxf8_convert_rne< bf8x2_ocp_t, float2_t > ( float2_t x,
float scale )
inline

◆ mxf8_convert_rne< bf8x32_ocp_t, float32_t >()

template<>
__host__ __device__ bf8x32_ocp_t ck::mxf8_convert_rne< bf8x32_ocp_t, float32_t > ( float32_t x,
float scale )
inline

◆ mxf8_convert_rne< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::mxf8_convert_rne< f8_ocp_t, float > ( float x,
float scale )
inline

◆ mxf8_convert_rne< f8x16_ocp_t, float16_t >()

template<>
__host__ __device__ f8x16_ocp_t ck::mxf8_convert_rne< f8x16_ocp_t, float16_t > ( float16_t x,
float scale )
inline

◆ mxf8_convert_rne< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::mxf8_convert_rne< f8x2_ocp_t, float2_t > ( float2_t x,
float scale )
inline

◆ mxf8_convert_rne< f8x32_ocp_t, float32_t >()

template<>
__host__ __device__ f8x32_ocp_t ck::mxf8_convert_rne< f8x32_ocp_t, float32_t > ( float32_t x,
float scale )
inline

◆ mxf8_convert_sr()

template<typename Y, typename X>
__host__ __device__ constexpr Y ck::mxf8_convert_sr ( X x,
float scale )
constexpr

◆ mxf8_convert_sr< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::mxf8_convert_sr< bf8_ocp_t, float > ( float x,
float scale )
inline

◆ mxf8_convert_sr< bf8x16_ocp_t, float16_t >()

template<>
__host__ __device__ bf8x16_ocp_t ck::mxf8_convert_sr< bf8x16_ocp_t, float16_t > ( float16_t x,
float scale )
inline

◆ mxf8_convert_sr< bf8x2_ocp_t, float2_t >()

template<>
__host__ __device__ bf8x2_ocp_t ck::mxf8_convert_sr< bf8x2_ocp_t, float2_t > ( float2_t x,
float scale )
inline

◆ mxf8_convert_sr< bf8x32_ocp_t, float32_t >()

template<>
__host__ __device__ bf8x32_ocp_t ck::mxf8_convert_sr< bf8x32_ocp_t, float32_t > ( float32_t x,
float scale )
inline

◆ mxf8_convert_sr< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::mxf8_convert_sr< f8_ocp_t, float > ( float x,
float scale )
inline

◆ mxf8_convert_sr< f8x16_ocp_t, float16_t >()

template<>
__host__ __device__ f8x16_ocp_t ck::mxf8_convert_sr< f8x16_ocp_t, float16_t > ( float16_t x,
float scale )
inline

◆ mxf8_convert_sr< f8x2_ocp_t, float2_t >()

template<>
__host__ __device__ f8x2_ocp_t ck::mxf8_convert_sr< f8x2_ocp_t, float2_t > ( float2_t x,
float scale )
inline

◆ mxf8_convert_sr< f8x32_ocp_t, float32_t >()

template<>
__host__ __device__ f8x32_ocp_t ck::mxf8_convert_sr< f8x32_ocp_t, float32_t > ( float32_t x,
float scale )
inline

◆ next_pow2()

auto ck::next_pow2 ( uint32_t x)
inline constexpr

◆ NormalizationKernelSelector()

template<typename XDataType, typename GammaDataType, typename BetaDataType, typename YDataType, typename SaveMeanInvStdDataType, typename ComputeDataType, typename YElementwiseOperation, typename GridDesc_M_K, typename GridDesc_M, index_t BlockSize, index_t MThreadClusterSize, index_t KThreadClusterSize, index_t MThreadSliceSize, index_t KThreadSliceSize, index_t XSrcVectorDim, index_t XSrcVectorSize, index_t GammaSrcVectorDim, index_t GammaSrcVectorSize, index_t BetaSrcVectorDim, index_t BetaSrcVectorSize, index_t YDstVectorDim, index_t YDstVectorSize, index_t SaveMeanInvStdDstVectorSize, bool UseWelford>
auto ck::NormalizationKernelSelector ( bool isSweepOnce)

◆ operator%() [1/4]

template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto ck::operator% ( integral_constant< TX, X > ,
integral_constant< TY, Y >  )
constexpr

◆ operator%() [2/4]

template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto ck::operator% ( Number< Y > ,
Sequence< Xs... >  )
constexpr

◆ operator%() [3/4]

template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto ck::operator% ( Sequence< Xs... > ,
Number< Y >  )
constexpr

◆ operator%() [4/4]

template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto ck::operator% ( Sequence< Xs... > ,
Sequence< Ys... >  )
constexpr

◆ operator*() [1/8]

template<index_t NSize, typename T>
__host__ __device__ constexpr auto ck::operator* ( const MultiIndex< NSize > & a,
const T & b )
constexpr

◆ operator*() [2/8]

template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto ck::operator* ( const Tuple< Xs... > & x,
const Y & y )
constexpr

◆ operator*() [3/8]

template<typename... Xs, typename Y, enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto ck::operator* ( const Tuple< Xs... > & x,
Y a )
constexpr

◆ operator*() [4/8]

template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto ck::operator* ( integral_constant< TX, X > ,
integral_constant< TY, Y >  )
constexpr

◆ operator*() [5/8]

template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto ck::operator* ( Number< Y > ,
Sequence< Xs... >  )
constexpr

◆ operator*() [6/8]

template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto ck::operator* ( Sequence< Xs... > ,
Number< Y >  )
constexpr

◆ operator*() [7/8]

template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto ck::operator* ( Sequence< Xs... > ,
Sequence< Ys... >  )
constexpr

◆ operator*() [8/8]

template<typename... Xs, typename Y, enable_if_t< ck::is_integral< Y >::value||ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto ck::operator* ( Y a,
const Tuple< Xs... > & x )
constexpr

◆ operator+() [1/6]

template<index_t NSize, typename T>
__host__ __device__ constexpr auto ck::operator+ ( const MultiIndex< NSize > & a,
const T & b )
constexpr

◆ operator+() [2/6]

template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto ck::operator+ ( const Tuple< Xs... > & x,
const Y & y )
constexpr

◆ operator+() [3/6]

template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto ck::operator+ ( integral_constant< TX, X > ,
integral_constant< TY, Y >  )
constexpr

◆ operator+() [4/6]

template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto ck::operator+ ( Number< Y > ,
Sequence< Xs... >  )
constexpr

◆ operator+() [5/6]

template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto ck::operator+ ( Sequence< Xs... > ,
Number< Y >  )
constexpr

◆ operator+() [6/6]

template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto ck::operator+ ( Sequence< Xs... > ,
Sequence< Ys... >  )
constexpr

◆ operator+=() [1/3]

template<typename Arr, typename Picks, typename X>
__host__ __device__ constexpr auto ck::operator+= ( ContainerElementPicker< Arr, Picks > & y,
const X & x )
constexpr

◆ operator+=() [2/3]

template<index_t NSize, typename X>
__host__ __device__ constexpr auto ck::operator+= ( MultiIndex< NSize > & y,
const X & x )
constexpr

◆ operator+=() [3/3]

template<typename... Ys, typename X, enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ __device__ constexpr auto ck::operator+= ( Tuple< Ys... > & y,
const X & x )
constexpr

◆ operator-() [1/6]

template<index_t NSize, typename T>
__host__ __device__ constexpr auto ck::operator- ( const MultiIndex< NSize > & a,
const T & b )
constexpr

◆ operator-() [2/6]

template<typename... Xs, typename Y, enable_if_t<!ck::is_integral< Y >::value &&!ck::is_floating_point< Y >::value, bool > = false>
__host__ __device__ constexpr auto ck::operator- ( const Tuple< Xs... > & x,
const Y & y )
constexpr

◆ operator-() [3/6]

template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto ck::operator- ( integral_constant< TX, X > ,
integral_constant< TY, Y >  )
constexpr

◆ operator-() [4/6]

template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto ck::operator- ( Number< Y > ,
Sequence< Xs... >  )
constexpr

◆ operator-() [5/6]

template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto ck::operator- ( Sequence< Xs... > ,
Number< Y >  )
constexpr

◆ operator-() [6/6]

template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto ck::operator- ( Sequence< Xs... > ,
Sequence< Ys... >  )
constexpr

◆ operator-=() [1/3]

template<typename Arr, typename Picks, typename X>
__host__ __device__ constexpr auto ck::operator-= ( ContainerElementPicker< Arr, Picks > & y,
const X & x )
constexpr

◆ operator-=() [2/3]

template<index_t NSize, typename X>
__host__ __device__ constexpr auto ck::operator-= ( MultiIndex< NSize > & y,
const X & x )
constexpr

◆ operator-=() [3/3]

template<typename... Ys, typename X, enable_if_t<!ck::is_integral< X >::value &&!ck::is_floating_point< X >::value, bool > = false>
__host__ __device__ constexpr auto ck::operator-= ( Tuple< Ys... > & y,
const X & x )
constexpr

◆ operator/() [1/4]

template<typename TX, TX X, typename TY, TY Y>
__host__ __device__ constexpr auto ck::operator/ ( integral_constant< TX, X > ,
integral_constant< TY, Y >  )
constexpr

◆ operator/() [2/4]

template<index_t Y, index_t... Xs>
__host__ __device__ constexpr auto ck::operator/ ( Number< Y > ,
Sequence< Xs... >  )
constexpr

◆ operator/() [3/4]

template<index_t... Xs, index_t Y>
__host__ __device__ constexpr auto ck::operator/ ( Sequence< Xs... > ,
Number< Y >  )
constexpr

◆ operator/() [4/4]

template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto ck::operator/ ( Sequence< Xs... > ,
Sequence< Ys... >  )
constexpr

◆ operator==()

template<index_t... Xs, index_t... Ys>
__host__ __device__ constexpr bool ck::operator== ( Sequence< Xs... > ,
Sequence< Ys... >  )
constexpr

◆ pick_container_element() [1/2]

template<typename Arr, typename Picks>
__host__ __device__ constexpr auto ck::pick_container_element ( Arr & a,
Picks  )
constexpr

◆ pick_container_element() [2/2]

template<typename Arr, typename Picks>
__host__ __device__ constexpr auto ck::pick_container_element ( const Arr & a,
Picks  )
constexpr

◆ pick_sequence_elements_by_ids()

template<typename Seq, index_t... Is>
__host__ __device__ constexpr auto ck::pick_sequence_elements_by_ids ( Seq ,
Sequence< Is... >  )
constexpr

◆ pick_sequence_elements_by_mask()

template<typename Seq, typename Mask>
__host__ __device__ constexpr auto ck::pick_sequence_elements_by_mask ( Seq ,
Mask  )
constexpr

◆ prand_generator() [1/3]

template<typename T, uint32_t seed_t, ck::enable_if_t< is_same< _Float16, T >{}, bool > = false>
__host__ __device__ uint32_t ck::prand_generator ( index_t id,
T val,
uint32_t seed = seed_t )

◆ prand_generator() [2/3]

template<typename T, uint32_t seed_t, ck::enable_if_t< is_same< float, T >{}, bool > = false>
__host__ __device__ uint32_t ck::prand_generator ( index_t id,
T val,
uint32_t seed = seed_t )

◆ prand_generator() [3/3]

template<typename T, uint32_t seed_t, ck::enable_if_t<!(is_same< float, T >{}||is_same< _Float16, T >{}), bool > = false>
__host__ __device__ uint32_t ck::prand_generator ( int id,
T val,
uint32_t seed = seed_t )

◆ print_multi_index()

template<typename... Xs>
__host__ __device__ void ck::print_multi_index ( const Tuple< Xs... > & x)

◆ reduce_on_sequence()

template<typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr index_t ck::reduce_on_sequence ( Seq ,
Reduce f,
Number< Init >  )
constexpr

◆ reverse_exclusive_scan_sequence()

template<typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto ck::reverse_exclusive_scan_sequence ( Seq ,
Reduce ,
Number< Init >  )
constexpr

◆ reverse_inclusive_scan_sequence()

template<typename Seq, typename Reduce, index_t Init>
__host__ __device__ constexpr auto ck::reverse_inclusive_scan_sequence ( Seq ,
Reduce ,
Number< Init >  )
constexpr

◆ s_nop()

__device__ void ck::s_nop ( )

◆ scaled_type_convert()

template<typename Y, typename X>
__host__ constexpr Y ck::scaled_type_convert ( e8m0_bexp_t scale,
X x )
constexpr

◆ scaled_type_convert< bf8_ocp_t, float >()

template<>
__host__ bf8_ocp_t ck::scaled_type_convert< bf8_ocp_t, float > ( e8m0_bexp_t scale,
float x )
inline

◆ scaled_type_convert< bf8x16_ocp_t, float16_t >()

template<>
__host__ bf8x16_ocp_t ck::scaled_type_convert< bf8x16_ocp_t, float16_t > ( e8m0_bexp_t scale,
float16_t x )
inline

◆ scaled_type_convert< bf8x2_ocp_t, float2_t >()

template<>
__host__ bf8x2_ocp_t ck::scaled_type_convert< bf8x2_ocp_t, float2_t > ( e8m0_bexp_t scale,
float2_t x )
inline

◆ scaled_type_convert< bf8x32_ocp_t, float32_t >()

template<>
__host__ bf8x32_ocp_t ck::scaled_type_convert< bf8x32_ocp_t, float32_t > ( e8m0_bexp_t scale,
float32_t x )
inline

◆ scaled_type_convert< f8_ocp_t, float >()

template<>
__host__ f8_ocp_t ck::scaled_type_convert< f8_ocp_t, float > ( e8m0_bexp_t scale,
float x )
inline

◆ scaled_type_convert< f8x16_ocp_t, float16_t >()

template<>
__host__ f8x16_ocp_t ck::scaled_type_convert< f8x16_ocp_t, float16_t > ( e8m0_bexp_t scale,
float16_t x )
inline

◆ scaled_type_convert< f8x2_ocp_t, float2_t >()

template<>
__host__ f8x2_ocp_t ck::scaled_type_convert< f8x2_ocp_t, float2_t > ( e8m0_bexp_t scale,
float2_t x )
inline

◆ scaled_type_convert< f8x32_ocp_t, float32_t >()

template<>
__host__ f8x32_ocp_t ck::scaled_type_convert< f8x32_ocp_t, float32_t > ( e8m0_bexp_t scale,
float32_t x )
inline

◆ scaled_type_convert< float, bf8_ocp_t >()

template<>
__host__ float ck::scaled_type_convert< float, bf8_ocp_t > ( e8m0_bexp_t scale,
bf8_ocp_t x )
inline

◆ scaled_type_convert< float, f8_ocp_t >()

template<>
__host__ float ck::scaled_type_convert< float, f8_ocp_t > ( e8m0_bexp_t scale,
f8_ocp_t x )
inline

◆ scaled_type_convert< float16_t, bf8x16_ocp_t >()

template<>
__host__ float16_t ck::scaled_type_convert< float16_t, bf8x16_ocp_t > ( e8m0_bexp_t scale,
bf8x16_ocp_t x )
inline

◆ scaled_type_convert< float16_t, f8x16_ocp_t >()

template<>
__host__ float16_t ck::scaled_type_convert< float16_t, f8x16_ocp_t > ( e8m0_bexp_t scale,
f8x16_ocp_t x )
inline

◆ scaled_type_convert< float2_t, bf8x2_ocp_t >()

template<>
__host__ float2_t ck::scaled_type_convert< float2_t, bf8x2_ocp_t > ( e8m0_bexp_t scale,
bf8x2_ocp_t x )
inline

◆ scaled_type_convert< float2_t, f8x2_ocp_t >()

template<>
__host__ float2_t ck::scaled_type_convert< float2_t, f8x2_ocp_t > ( e8m0_bexp_t scale,
f8x2_ocp_t x )
inline

◆ scaled_type_convert< float32_t, bf8x32_ocp_t >()

template<>
__host__ float32_t ck::scaled_type_convert< float32_t, bf8x32_ocp_t > ( e8m0_bexp_t scale,
bf8x32_ocp_t x )
inline

◆ scaled_type_convert< float32_t, f8x32_ocp_t >()

template<>
__host__ float32_t ck::scaled_type_convert< float32_t, f8x32_ocp_t > ( e8m0_bexp_t scale,
f8x32_ocp_t x )
inline

◆ sequence_all_of()

template<typename Seq, typename F>
__host__ __device__ constexpr bool ck::sequence_all_of ( Seq ,
F f )
constexpr

◆ sequence_any_of()

template<typename Seq, typename F>
__host__ __device__ constexpr bool ck::sequence_any_of ( Seq ,
F f )
constexpr

◆ sequence_pop_back()

template<typename Seq>
__host__ __device__ constexpr auto ck::sequence_pop_back ( Seq )
constexpr

◆ sequence_pop_front()

template<index_t I, index_t... Is>
__host__ __device__ constexpr auto ck::sequence_pop_front ( Sequence< I, Is... > )
constexpr

◆ sequence_to_tuple_of_number()

template<index_t... Is>
__host__ __device__ constexpr auto ck::sequence_to_tuple_of_number ( Sequence< Is... > )
constexpr

◆ set_container_subset() [1/2]

template<typename T, index_t N, index_t... Is>
__host__ __device__ constexpr void ck::set_container_subset ( Array< T, N > & y,
Sequence< Is... > picks,
const Array< T, sizeof...(Is)> & x )
constexpr

◆ set_container_subset() [2/2]

template<typename... Ys, index_t... Is, typename... Xs>
__host__ __device__ constexpr void ck::set_container_subset ( Tuple< Ys... > & y,
Sequence< Is... > picks,
const Tuple< Xs... > & x )
constexpr

◆ tie()

template<typename... Args>
Tuple< Args &... > ck::tie ( Args &... args)
constexpr noexcept

◆ to_multi_index()

template<typename T>
__host__ __device__ constexpr auto ck::to_multi_index ( const T & x)
constexpr

◆ to_sequence()

template<index_t... Is>
__host__ __device__ constexpr auto ck::to_sequence ( Tuple< Number< Is >... > )
constexpr

◆ transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad()

template<typename... In, typename... Wei, typename... Out, typename ConvStrides, typename ConvDilations, typename InLeftPads, typename InRightPads, index_t GemmK1Value>
__host__ __device__ constexpr auto ck::transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad ( const TensorDescriptor< In... > & in_grid_desc_n_di_hi_wi_c,
const TensorDescriptor< Wei... > & wei_k_z_y_x_c_grid_desc,
const TensorDescriptor< Out... > & out_n_do_ho_wo_k_grid_desc,
const ConvStrides & conv_strides,
const ConvDilations & conv_dilations,
const InLeftPads & in_left_pads,
const InRightPads & in_right_pads,
Number< GemmK1Value >  )
constexpr

◆ transform_sequences() [1/3]

template<typename F, index_t... Xs>
__host__ __device__ constexpr auto ck::transform_sequences ( F f,
Sequence< Xs... >  )
constexpr

◆ transform_sequences() [2/3]

template<typename F, index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto ck::transform_sequences ( F f,
Sequence< Xs... > ,
Sequence< Ys... >  )
constexpr

◆ transform_sequences() [3/3]

template<typename F, index_t... Xs, index_t... Ys, index_t... Zs>
__host__ __device__ constexpr auto ck::transform_sequences ( F f,
Sequence< Xs... > ,
Sequence< Ys... > ,
Sequence< Zs... >  )
constexpr

◆ transform_tensor_descriptor()

template<typename OldTensorDescriptor, typename NewTransforms, typename NewLowerDimensionOldVisibleIdss, typename NewUpperDimensionNewVisibleIdss>
__host__ __device__ constexpr auto ck::transform_tensor_descriptor ( const OldTensorDescriptor & old_tensor_desc,
const NewTransforms & new_transforms,
NewLowerDimensionOldVisibleIdss ,
NewUpperDimensionNewVisibleIdss  )
constexpr

◆ transform_tuples() [1/3]

template<typename F, typename X>
__host__ __device__ constexpr auto ck::transform_tuples ( F f,
const X & x )
constexpr

◆ transform_tuples() [2/3]

template<typename F, typename X, typename Y>
__host__ __device__ constexpr auto ck::transform_tuples ( F f,
const X & x,
const Y & y )
constexpr

◆ transform_tuples() [3/3]

template<typename F, typename X, typename Y, typename Z>
__host__ __device__ constexpr auto ck::transform_tuples ( F f,
const X & x,
const Y & y,
const Z & z )
constexpr

◆ transpose_f8_4x4()

__device__ void ck::transpose_f8_4x4 ( const f8x4_t & x0,
const f8x4_t & x1,
const f8x4_t & x2,
const f8x4_t & x3,
f8x4_t & y0,
f8x4_t & y1,
f8x4_t & y2,
f8x4_t & y3 )

◆ transpose_fp16_2x2()

__device__ void ck::transpose_fp16_2x2 ( const half2_t & x0,
const half2_t & x1,
half2_t & y0,
half2_t & y1 )

◆ transpose_int8_4x4()

__device__ void ck::transpose_int8_4x4 ( const int8x4_t & x0,
const int8x4_t & x1,
const int8x4_t & x2,
const int8x4_t & x3,
int8x4_t & y0,
int8x4_t & y1,
int8x4_t & y2,
int8x4_t & y3 )

◆ TupleDepth() [1/2]

template<index_t depth = 0, typename T>
__host__ __device__ constexpr auto ck::TupleDepth ( const T & )
constexpr

◆ TupleDepth() [2/2]

template<index_t depth = 0, typename... Ts>
__host__ __device__ constexpr auto ck::TupleDepth ( const Tuple< Ts... > & )
constexpr

◆ TupleReduce()

template<index_t Idx, index_t End, typename F, typename... Ts>
__host__ __device__ constexpr auto ck::TupleReduce ( F && f,
const Tuple< Ts... > & tuple )
constexpr

◆ TupleReverse()

template<typename... Ts>
__host__ __device__ constexpr auto ck::TupleReverse ( const Tuple< Ts... > & tuple)
constexpr

◆ TupleSlice()

template<index_t from, index_t to, typename... Ts>
__host__ __device__ constexpr auto ck::TupleSlice ( const Tuple< Ts... > & tuple)
constexpr

◆ type_convert() [1/3]

template<typename Y, enable_if_t< is_same_v< Y, ck::tf32_t >, bool > = false>
__host__ __device__ constexpr float ck::type_convert ( float x)
inline constexpr

◆ type_convert() [2/3]

template<typename Y, typename X, ck::enable_if_t< ck::is_const_v< Y >||ck::is_const_v< X >, bool > = false>
__host__ __device__ constexpr Y ck::type_convert ( X x)
constexpr

◆ type_convert() [3/3]

template<typename Y, typename X, ck::enable_if_t<!(ck::is_const_v< Y >||ck::is_const_v< X >), bool > = false>
__host__ __device__ constexpr Y ck::type_convert ( X x)
constexpr

◆ type_convert< bf6_t, float >()

template<>
__host__ __device__ bf6_t ck::type_convert< bf6_t, float > ( float x)
inline

Specializes float-to-bf6_t conversion.

Uses stochastic rounding if CK_USE_SR_F6_CONVERSION is defined, otherwise uses round-to-nearest-even.

Parameters
x: Input float value to convert.
Returns
Converted bf6_t value.

◆ type_convert< bf6x16_pk_t, float16_t >()

template<>
__host__ __device__ bf6x16_pk_t ck::type_convert< bf6x16_pk_t, float16_t > ( float16_t x)
inline

◆ type_convert< bf6x16_t, float16_t >()

template<>
__host__ __device__ bf6x16_t ck::type_convert< bf6x16_t, float16_t > ( float16_t x)
inline

◆ type_convert< bf6x32_pk_t, float32_t >()

template<>
__host__ __device__ bf6x32_pk_t ck::type_convert< bf6x32_pk_t, float32_t > ( float32_t x)
inline

◆ type_convert< bf6x32_t, float32_t >()

template<>
__host__ __device__ bf6x32_t ck::type_convert< bf6x32_t, float32_t > ( float32_t x)
inline

Specializes the conversion of a vector of 32 floats to a vector of 32 bf6_t values (bf6x32_t).

Uses stochastic rounding if CK_USE_SR_F6_CONVERSION is defined, otherwise uses round-to-nearest-even.

Parameters
x: Input float vector to convert.
Returns
Converted bf6x32_t vector.

◆ type_convert< bf8_fnuz_t, float >()

template<>
__host__ __device__ bf8_fnuz_t ck::type_convert< bf8_fnuz_t, float > ( float x)
inline

◆ type_convert< bf8_fnuz_t, half_t >()

template<>
__host__ __device__ bf8_fnuz_t ck::type_convert< bf8_fnuz_t, half_t > ( half_t x)
inline

◆ type_convert< bf8_ocp_t, bhalf_t >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, bhalf_t > ( bhalf_t x)
inline

Converts a bhalf_t value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x: The input bhalf_t value.
Returns
The converted bf8_ocp_t value.

◆ type_convert< bf8_ocp_t, float >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, float > ( float x)
inline

Converts a float value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x: The input float value.
Returns
The converted bf8_ocp_t value.

◆ type_convert< bf8_ocp_t, half_t >()

template<>
__host__ __device__ bf8_ocp_t ck::type_convert< bf8_ocp_t, half_t > ( half_t x)
inline

Converts a half_t value to a bf8_ocp_t value with rounding determined by a flag.

Parameters
x: The input half_t value.
Returns
The converted bf8_ocp_t value.

◆ type_convert< bf8_ocp_t, int >()

template<>
__host__ __device__ constexpr bf8_ocp_t ck::type_convert< bf8_ocp_t, int > ( int x)
inline constexpr

◆ type_convert< bhalf2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, bf8x2_ocp_t > ( bf8x2_ocp_t x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 bhalf_t values.

Parameters
x: The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 bhalf_t values.

◆ type_convert< bhalf2_t, f8x2_ocp_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, f8x2_ocp_t > ( f8x2_ocp_t x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 bhalf_t values.

Parameters
x: The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 bhalf_t values.

◆ type_convert< bhalf2_t, pk_i4_t >()

template<>
__host__ __device__ bhalf2_t ck::type_convert< bhalf2_t, pk_i4_t > ( pk_i4_t x)
inline

◆ type_convert< bhalf_t, bf8_ocp_t >()

template<>
__host__ __device__ bhalf_t ck::type_convert< bhalf_t, bf8_ocp_t > ( bf8_ocp_t x)
inline

Converts a bf8_ocp_t value to a bhalf_t value.

Parameters
x: The input bf8_ocp_t value.
Returns
The converted bhalf_t value.

◆ type_convert< bhalf_t, f8_ocp_t >()

template<>
__host__ __device__ bhalf_t ck::type_convert< bhalf_t, f8_ocp_t > ( f8_ocp_t x)
inline

Converts a f8_ocp_t value to a bhalf_t value.

Parameters
x: The input f8_ocp_t value.
Returns
The converted bhalf_t value.

◆ type_convert< bhalf_t, float >()

template<>
__host__ __device__ constexpr bhalf_t ck::type_convert< bhalf_t, float > ( float x)
inline constexpr

◆ type_convert< bhalf_t, half_t >()

template<>
__host__ __device__ constexpr bhalf_t ck::type_convert< bhalf_t, half_t > ( half_t x)
inline constexpr

◆ type_convert< bhalf_t, int8_t >()

template<>
__host__ __device__ constexpr bhalf_t ck::type_convert< bhalf_t, int8_t > ( int8_t x)
inline constexpr

◆ type_convert< f4_t, float >()

template<>
__host__ __device__ f4_t ck::type_convert< f4_t, float > ( float x)
inline

◆ type_convert< f4x2_pk_t, float2_t >()

template<>
__host__ __device__ f4x2_pk_t ck::type_convert< f4x2_pk_t, float2_t > ( float2_t x)
inline

◆ type_convert< f4x2_t, float2_t >()

template<>
__host__ __device__ f4x2_t ck::type_convert< f4x2_t, float2_t > ( float2_t x)
inline

◆ type_convert< f4x32_t, float32_t >()

template<>
__host__ __device__ f4x32_t ck::type_convert< f4x32_t, float32_t > ( float32_t x)
inline

◆ type_convert< f6_t, float >()

template<>
__host__ __device__ f6_t ck::type_convert< f6_t, float > ( float x)
inline

Specializes the type conversion template for converting a float into the 6-bit float type (f6_t).

Depending on the CK_USE_SR_F6_CONVERSION flag, the conversion uses stochastic rounding or round-to-nearest-even.

Parameters
x: Input float value to be converted.
Returns
The converted f6_t value.

◆ type_convert< f6x16_pk_t, float16_t >()

template<>
__host__ __device__ f6x16_pk_t ck::type_convert< f6x16_pk_t, float16_t > ( float16_t x)
inline

◆ type_convert< f6x16_t, float16_t >()

template<>
__host__ __device__ f6x16_t ck::type_convert< f6x16_t, float16_t > ( float16_t x)
inline

◆ type_convert< f6x32_pk_t, float32_t >()

template<>
__host__ __device__ f6x32_pk_t ck::type_convert< f6x32_pk_t, float32_t > ( float32_t x)
inline

◆ type_convert< f6x32_t, float32_t >()

template<>
__host__ __device__ f6x32_t ck::type_convert< f6x32_t, float32_t > ( float32_t x)
inline

Specializes the type conversion template for converting a vector of 32 floats into a vector of 32 6-bit floats (f6x32_t).

Depending on the CK_USE_SR_F6_CONVERSION flag, the conversion uses stochastic rounding or round-to-nearest-even.

Parameters
x: Input vector of 32 floats to be converted.
Returns
The converted f6x32_t vector.

◆ type_convert< f8_fnuz_t, float >()

template<>
__host__ __device__ f8_fnuz_t ck::type_convert< f8_fnuz_t, float > ( float x)
inline

◆ type_convert< f8_fnuz_t, half_t >()

template<>
__host__ __device__ f8_fnuz_t ck::type_convert< f8_fnuz_t, half_t > ( half_t x)
inline

◆ type_convert< f8_ocp_t, bhalf_t >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, bhalf_t > ( bhalf_t x)
inline

Converts a bhalf_t value to a f8_ocp_t value with rounding determined by a flag.

Parameters
x: The input bhalf_t value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, float >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, float > ( float x)
inline

Converts a float value to a f8_ocp_t value with rounding determined by a flag.

Parameters
x: The input float value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, half_t >()

template<>
__host__ __device__ f8_ocp_t ck::type_convert< f8_ocp_t, half_t > ( half_t x)
inline

Converts a half_t value to a f8_ocp_t value with rounding determined by a flag.

Parameters
x: The input half_t value.
Returns
The converted f8_ocp_t value.

◆ type_convert< f8_ocp_t, int >()

template<>
__host__ __device__ constexpr f8_ocp_t ck::type_convert< f8_ocp_t, int > ( int x)
inline constexpr

◆ type_convert< float, bf6_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf6_t > ( bf6_t x)
inline

Specializes the type conversion template for converting a bf6_t value to float.

Interprets the bf6_t value using the default scale factor of 1 and returns its floating-point representation.

Parameters
x: The bf6_t value to convert.
Returns
The float representation of the given bf6_t value.

◆ type_convert< float, bf8_fnuz_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf8_fnuz_t > ( bf8_fnuz_t x)
inline

◆ type_convert< float, bf8_ocp_t >()

template<>
__host__ __device__ float ck::type_convert< float, bf8_ocp_t > ( bf8_ocp_t x)
inline

Converts a bf8_ocp_t value to a float value.

Parameters
x: The input bf8_ocp_t value.
Returns
The converted float value.

◆ type_convert< float, bhalf_t >()

template<>
__host__ __device__ constexpr float ck::type_convert< float, bhalf_t > ( bhalf_t x)
inline constexpr

◆ type_convert< float, f4_t >()

template<>
__host__ __device__ float ck::type_convert< float, f4_t > ( f4_t x)
inline

◆ type_convert< float, f6_t >()

template<>
__host__ __device__ float ck::type_convert< float, f6_t > ( f6_t x)
inline

Specializes the type conversion template for converting the 6-bit float type (f6_t) to float.

Interprets an f6_t value as a float using the default scale factor of 1.

Parameters
x: The 6-bit float (f6_t) value to be converted.
Returns
The corresponding float representation.

◆ type_convert< float, f8_fnuz_t >()

template<>
__host__ __device__ float ck::type_convert< float, f8_fnuz_t > ( f8_fnuz_t x)
inline

◆ type_convert< float, f8_ocp_t >()

template<>
__host__ __device__ float ck::type_convert< float, f8_ocp_t > ( f8_ocp_t x)
inline

Converts a f8_ocp_t value to a float value.

Parameters
x: The input f8_ocp_t value.
Returns
The converted float value.

◆ type_convert< float16_t, bf6x16_pk_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, bf6x16_pk_t > ( bf6x16_pk_t x)
inline

◆ type_convert< float16_t, bf6x16_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, bf6x16_t > ( bf6x16_t x)
inline

◆ type_convert< float16_t, f6x16_pk_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, f6x16_pk_t > ( f6x16_pk_t x)
inline

◆ type_convert< float16_t, f6x16_t >()

template<>
__host__ __device__ float16_t ck::type_convert< float16_t, f6x16_t > ( f6x16_t x)
inline

◆ type_convert< float2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, bf8x2_ocp_t > ( bf8x2_ocp_t x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 float values.

Parameters
x: The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 float values.

◆ type_convert< float2_t, f4x2_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f4x2_t > ( f4x2_t x)
inline

◆ type_convert< float2_t, f8x2_fnuz_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f8x2_fnuz_t > ( f8x2_fnuz_t x)
inline

◆ type_convert< float2_t, f8x2_ocp_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, f8x2_ocp_t > ( f8x2_ocp_t x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 float values.

Parameters
x: The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 float values.

◆ type_convert< float2_t, pk_i4_t >()

template<>
__host__ __device__ float2_t ck::type_convert< float2_t, pk_i4_t > ( pk_i4_t x)
inline

◆ type_convert< float32_t, bf6x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, bf6x32_t > ( bf6x32_t x)
inline

Specializes the type conversion template for converting a vector of 32 bf6_t values to a vector of 32 floats.

Interprets the bf6x32_t value using the default scale factor of 1 and returns its floating-point representation.

Parameters
x: The bf6x32_t value to convert.
Returns
The float representation of the given vector.

◆ type_convert< float32_t, f4x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, f4x32_t > ( f4x32_t x)
inline

◆ type_convert< float32_t, f6x32_t >()

template<>
__host__ __device__ float32_t ck::type_convert< float32_t, f6x32_t > ( f6x32_t x)
inline

Specializes the type conversion template for converting a vector of 32 6-bit floats (f6x32_t) to a vector of 32 floats.

Interprets the f6_t values as floats using the default scale factor of 1.

Parameters
x: The vector of 32 6-bit float values (f6x32_t) to be converted.
Returns
The corresponding float representation.

◆ type_convert< half2_t, bf8x2_ocp_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, bf8x2_ocp_t > ( bf8x2_ocp_t x)
inline

Converts a vector of 2 bf8_ocp_t values to a vector of 2 half_t values.

Parameters
x: The input vector of 2 bf8_ocp_t values.
Returns
The converted vector of 2 half_t values.

◆ type_convert< half2_t, f8x2_ocp_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, f8x2_ocp_t > ( f8x2_ocp_t x)
inline

Converts a vector of 2 f8_ocp_t values to a vector of 2 half_t values.

Parameters
x: The input vector of 2 f8_ocp_t values.
Returns
The converted vector of 2 half_t values.

◆ type_convert< half2_t, float2_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, float2_t > ( float2_t x)
inline

◆ type_convert< half2_t, pk_i4_t >()

template<>
__host__ __device__ half2_t ck::type_convert< half2_t, pk_i4_t > ( pk_i4_t x)
inline

◆ type_convert< half_t, bf8_fnuz_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, bf8_fnuz_t > ( bf8_fnuz_t x)
inline

◆ type_convert< half_t, bf8_ocp_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, bf8_ocp_t > ( bf8_ocp_t x)
inline

Converts a bf8_ocp_t value to a half_t value.

Parameters
x: The input bf8_ocp_t value.
Returns
The converted half_t value.

◆ type_convert< half_t, bhalf_t >()

template<>
__host__ __device__ constexpr half_t ck::type_convert< half_t, bhalf_t > ( bhalf_t x)
inline constexpr

◆ type_convert< half_t, f8_fnuz_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, f8_fnuz_t > ( f8_fnuz_t x)
inline

◆ type_convert< half_t, f8_ocp_t >()

template<>
__host__ __device__ half_t ck::type_convert< half_t, f8_ocp_t > ( f8_ocp_t x)
inline

Converts a f8_ocp_t value to a half_t value.

Parameters
x: The input f8_ocp_t value.
Returns
The converted half_t value.

◆ type_convert< int8_t, bhalf_t >()

template<>
__host__ __device__ constexpr int8_t ck::type_convert< int8_t, bhalf_t > ( bhalf_t x)
inline constexpr

◆ type_convert_sp()

template<typename Y, typename X>
__host__ __device__ constexpr Y ck::type_convert_sp ( X x)
constexpr

◆ type_convert_sp< bhalf_t, float >()

template<>
__host__ __device__ constexpr bhalf_t ck::type_convert_sp< bhalf_t, float > ( float x)
inline constexpr

◆ type_convert_sp< bhalf_t, int >()

template<>
__host__ __device__ constexpr bhalf_t ck::type_convert_sp< bhalf_t, int > ( int x)
inline constexpr

◆ type_convert_sp< f8_t, int >()

template<>
__host__ __device__ constexpr f8_t ck::type_convert_sp< f8_t, int > ( int x)
inline constexpr

◆ type_convert_sp< float, int >()

template<>
__host__ __device__ constexpr float ck::type_convert_sp< float, int > ( int x)
inline constexpr

◆ type_convert_sp< half_t, float >()

template<>
__host__ __device__ constexpr half_t ck::type_convert_sp< half_t, float > ( float x)
inline constexpr

◆ type_convert_sp< half_t, int >()

template<>
__host__ __device__ constexpr half_t ck::type_convert_sp< half_t, int > ( int x)
inline constexpr

◆ type_convert_sp< int, bhalf_t >()

template<>
__host__ __device__ constexpr int ck::type_convert_sp< int, bhalf_t > ( bhalf_t x)
inline constexpr

◆ type_convert_sp< int, f8_t >()

template<>
__host__ __device__ constexpr int ck::type_convert_sp< int, f8_t > ( f8_t x)
inline constexpr

◆ type_convert_sp< int, float >()

template<>
__host__ __device__ constexpr int ck::type_convert_sp< int, float > ( float x)
inline constexpr

◆ type_convert_sp< int, half_t >()

template<>
__host__ __device__ constexpr int ck::type_convert_sp< int, half_t > ( half_t x)
inline constexpr

◆ unpack()

template<typename F, typename X>
__host__ __device__ constexpr auto ck::unpack ( F && f,
X && x )
constexpr

◆ unpack2()

template<typename F, typename X, typename Y>
__host__ __device__ constexpr auto ck::unpack2 ( F && f,
X && x,
Y && y )
constexpr

◆ UnrollNestedTuple() [1/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename T>
__host__ __device__ constexpr auto ck::UnrollNestedTuple ( const T & element)
constexpr

◆ UnrollNestedTuple() [2/3]

template<index_t Depth = 0, index_t MaxDepth = -1, typename... Ts>
__host__ __device__ constexpr auto ck::UnrollNestedTuple ( const Tuple< Ts... > & tuple)
constexpr

◆ UnrollNestedTuple() [3/3]

template<index_t Depth = 0, index_t MaxDepth = -1>
__host__ __device__ constexpr auto ck::UnrollNestedTuple ( const Tuple<> & element)
constexpr

◆ UpdateEnvVar() [1/2]

template<typename EnvVar>
void ck::UpdateEnvVar ( EnvVar ,
const std::string_view & val )

◆ UpdateEnvVar() [2/2]

template<typename EnvVar, typename ValueType>
void ck::UpdateEnvVar ( EnvVar ,
const ValueType & val )

Updates the cached value of an environment variable.

Variable Documentation

◆ ignore

detail::ignore_t ck::ignore
inline constexpr

◆ is_base_of_v

template<typename X, typename Y>
bool ck::is_base_of_v = is_base_of<X, Y>::value
inline constexpr

◆ is_packed_type_v

template<typename T>
bool ck::is_packed_type_v = packed_size_v<T> > 1
inline constexpr

◆ is_pointer_v

template<typename T>
bool ck::is_pointer_v = is_pointer<T>::value
inline constexpr

◆ is_same_v

template<typename X, typename Y>
bool ck::is_same_v = is_same<X, Y>::value
inline constexpr

◆ is_unsigned_v

template<typename T>
bool ck::is_unsigned_v = is_unsigned<T>::value
inline constexpr

◆ packed_size_v

template<typename T>
index_t ck::packed_size_v = packed_type_info<T>::packed_size
inline constexpr