|
| | StreamKTilePartitioner_v2 (ck_tile::index_t m, ck_tile::index_t n, ck_tile::index_t k, ck_tile::index_t grid) |
| CK_TILE_HOST auto | grid_size () const noexcept -> dim3 |
| | Calculates the launch grid size for the Stream-K kernel. In the Non-Persistent case, extra workgroups are allocated for the data parallel section, so the grid size is the total number of Stream-K and data parallel workgroups.
|
| CK_TILE_HOST_DEVICE index_t | get_dp_ctas () const noexcept |
| | Returns the total number of DP workgroups.
|
| CK_TILE_HOST_DEVICE index_t | get_dp_start_block_idx () const noexcept |
| | Returns the starting DP workgroup index. It is always zero.
|
| CK_TILE_HOST_DEVICE index_t | get_sk_start_block_idx () const noexcept |
| | Returns the index of the first Stream-K workgroup. It is set to the number of dp_tiles_.
|
| | StreamKTilePartitionerBase (index_t m, index_t n, index_t k, index_t grid) |
| CK_TILE_HOST_DEVICE index_t | get_partials_buffer_size (index_t acc_element_bytes) const noexcept |
| | Calculates the total space needed for the partials buffer.
|
| CK_TILE_HOST_DEVICE index_t | get_flags_buffer_size () const noexcept |
| | Calculates the total space needed for the flags buffer.
|
| CK_TILE_DEVICE void | get_iter_boundaries (index_t &iter_start, index_t &iter_end, index_t cta_idx) const noexcept |
| | Calculates the start and end iteration given the cta_idx.
|
| CK_TILE_DEVICE index_t | get_tile_index (index_t iter_start) const noexcept |
| | Calculates the 1D tile index in the C tensor for a workgroup.
|
| CK_TILE_DEVICE void | get_tile_boundaries (index_t &tile_iter_start, index_t &tile_iter_end, index_t tile_idx) const noexcept |
| | Calculates the starting and ending tile boundaries for the given 1D tile index.
|
| CK_TILE_DEVICE auto | get_output_tile_index (index_t tile_idx) const noexcept -> tuple< index_t, index_t > |
| | Calculates the workgroup's 2D tile index in the C tensor given the 1D tile index.
|
| CK_TILE_HOST_DEVICE index_t | get_workspace_size (index_t acc_element_bytes) const noexcept |
| | Calculates the total space needed for the partials and flags buffers.
|
| CK_TILE_HOST_DEVICE index_t | get_num_tiles () const noexcept |
| | Returns the number of macro tiles in the C tensor.
|
| CK_TILE_HOST_DEVICE index_t | get_grid () const noexcept |
| | Returns the maximum number of active workgroups; this is assumed to be the number of CUs * occupancy.
|
| CK_TILE_HOST_DEVICE index_t | get_dp_tiles () const noexcept |
| | Returns the number of tiles in the C tensor that will use the data-parallel (DP) approach.
|
| CK_TILE_HOST_DEVICE index_t | get_sk_tiles () const noexcept |
| | Returns the number of tiles in the C tensor that will use the Stream-K approach.
|
| CK_TILE_HOST_DEVICE index_t | get_sk_ctas () const noexcept |
| | Returns the number of workgroups that will participate in Stream-K in the sk_tiles_.
|
| CK_TILE_HOST_DEVICE index_t | get_total_sk_iters () const noexcept |
| | Returns the total number of Stream-K iterations.
|
| CK_TILE_HOST_DEVICE index_t | get_iters_per_tile () const noexcept |
| | Returns the total number of iterations per tile in the C tensor. In other words, this is the total number of macro tiles along the K dimension of A and B.
|
| CK_TILE_HOST_DEVICE index_t | get_iters_per_sk_cta () const noexcept |
| | Returns the total number of Stream-K iterations for each sk_cta. This is the lower bound (i.e., all sk_ctas_ are guaranteed to perform at least this many iterations).
|
| CK_TILE_HOST_DEVICE index_t | get_extra_iters () const noexcept |
| | Returns the remainder resulting from total_sk_iters_ divided by sk_ctas_. When this is non-zero, the first extra_iters_ sk_ctas_ will get one additional iteration assigned to them; such workgroups will perform (iters_per_sk_cta_ + 1) iterations.
|
| CK_TILE_HOST_DEVICE index_t | get_total_dp_iters () const noexcept |
| | Returns the total number of DP iterations.
|
| CK_TILE_HOST_DEVICE index_t | get_n () const noexcept |
| | Returns the n dimension for the GEMM problem.
|
| CK_TILE_HOST index_t | estimate_num_wgs_per_tile () const noexcept |
| | Returns an estimate of the number of workgroups writing to the same macro tile in C.
|
template<typename BlockGemmShapeType,
StreamKReductionStrategy ReductionStrategyType>
struct ck_tile::StreamKTilePartitioner_v2< BlockGemmShapeType, ReductionStrategyType, false >
Non-Persistent Stream-K tile partitioner derived struct.
This partitioner is responsible for mapping workgroups to tiles in the C tensor for the Stream-K algorithm when using a Non-Persistent approach where extra workgroups are allocated for the data parallel section.
- Template Parameters
-
| BlockGemmShapeType | A class providing basic GEMM parameters. |
| ReductionStrategyType | An enum that defines the reduction strategy for the results in the C tensor. |