31template <
typename SrcDatas,
35 typename ElementwiseOperation,
37 typename SliceLengths,
38 typename SrcDimAccessOrder,
39 typename DstDimAccessOrder,
42 typename SrcScalarPerVectors,
44 typename SrcResetCoordinateAfterRunFlags,
45 typename DstResetCoordinateAfterRunFlags,
47 typename InterDatas = DstDatas>
60 template <
typename Descs,
62 enable_if_t<Descs::Size() == Indices::Size(),
bool> =
false>
95 const SrcDescs& src_descs,
97 const DstDescs& dst_descs,
99 const ElementwiseOperation& element_op)
102 element_op_(element_op)
105 "wrong! cannot evenly divide");
108 "wrong! cannot evenly divide");
111 template <
typename Indices, enable_if_t<SrcDescs::Size() == Indices::Size(),
bool> = false>
113 const Indices& src_slice_origin_idxs)
120 template <
typename Indices, enable_if_t<DstDescs::Size() == Indices::Size(),
bool> = false>
122 const Indices& dst_slice_origin_idxs)
129 template <
typename DataTypes, index_t ScalarPerVector>
132 auto data_types = DataTypes{};
134 constexpr index_t num = data_types.Size();
147 template <
typename SrcBuffers,
149 enable_if_t<SrcDescs::Size() == SrcBuffers::Size(),
bool> =
false>
150 __device__
void RunRead(
const SrcDescs& src_descs,
151 const SrcBuffers& src_bufs,
162 static_for<0, nSrc, 1>{}([&](
auto i) {
163 using src_vector_t =
typename remove_cvref_t<
decltype(src_vectors[i])>::type;
165 const bool is_src_valid =
169 oob_val = oob_val & is_src_valid;
174 if constexpr(SrcScalarPerVectors{}[i] == 1)
176 auto data_types = SrcDatas{};
179 src_bufs[i].template Get<DataType>(src_coords_[i].GetOffset(),
true);
181 static_for<0, SrcScalarPerVector, 1>{}(
182 [&](
auto j) { src_vectors(i).template AsType<DataType>()(j) = tmp; });
186 src_vectors(i).template AsType<src_vector_t>()(
I0) =
187 src_bufs[i].
template Get<src_vector_t>(src_coords_[i].GetOffset(),
true);
191 constexpr auto get_elem_op_vec_len = []() {
194 if constexpr(
decltype(element_op_)::is_pack8_invocable)
199 if constexpr(
decltype(element_op_)::is_pack4_invocable)
204 if constexpr(
decltype(element_op_)::is_pack2_invocable)
210 constexpr index_t elem_op_vec_len = get_elem_op_vec_len();
217 [&](
auto iSrc) ->
const auto& {
220 using elem_op_vec_t =
typename vector_type<SrcData, elem_op_vec_len>::type;
222 return src_vectors[iSrc].template AsType<elem_op_vec_t>()[i];
229 [&](
auto iDst) ->
auto& {
232 using elem_op_vec_t =
233 typename vector_type<InterData, elem_op_vec_len>::type;
235 return elm_vectors(iDst).template AsType<elem_op_vec_t>()(i);
247 unpack2(element_op_, dst_data_refs, src_data_refs);
250 elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
251 oob_vectors_tuple_(thread_scratch_id)(iAccess) = oob_val;
254 if constexpr(iAccess.value != src_num_access - 1)
258 static_for<0, nSrc, 1>{}([&](
auto i) {
267 static_for<0, nSrc, 1>{}([&](
auto i) {
268 if constexpr(SrcResetCoordinateAfterRunFlags::At(i))
270 const auto src_reset_step =
279 template <index_t ThreadScratchId = 0>
284 auto elm_vectors = elm_vectors_tuple_[thread_scratch_id][iAccess];
285 auto oob_val = oob_vectors_tuple_[thread_scratch_id][iAccess];
287 static_for<0, nDst, 1>{}([&](
auto i) {
288 using elm_vector_t =
typename remove_cvref_t<
decltype(elm_vectors[i])>::type;
289 elm_vectors(i).template AsType<elm_vector_t>()(
I0) =
290 oob_val ? elm_vectors(i).template AsType<elm_vector_t>()[
I0] : elm_vector_t{0};
293 elm_vectors_tuple_(thread_scratch_id)(iAccess) = elm_vectors;
298 template <index_t ThreadScratchId = 0>
304 using ElmThreadScratch =
310 using DstThreadScratch =
317 ElmThreadScratch elm_thread_scratch_;
318 DstThreadScratch dst_thread_scratch_;
320 elm_thread_scratch_.data_ =
321 bit_cast<
decltype(elm_thread_scratch_.data_)>(elm_vectors_tuple_[thread_scratch_id]);
323 if constexpr(SrcVectorDim != DstVectorDim &&
324 ((is_same<half_t, remove_cvref_t<InterData>>
::value &&
326 (is_same<f8_t, remove_cvref_t<InterData>>
::value &&
328 (is_same<int8_t, remove_cvref_t<InterData>>
::value &&
341 detail::lambda_scalar_step_in_vector<SrcVectorDim>{},
Number<nDim>{});
344 detail::lambda_scalar_step_in_vector<DstVectorDim>{},
Number<nDim>{});
347 detail::lambda_scalar_per_access_for_src_and_dst<SrcVectorDim,
350 DstScalarPerVector>{},
353 constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
355 static_ford<
decltype(access_lengths)>{}([&](
auto access_idx) {
356 constexpr auto data_idx = access_idx * scalar_per_access;
367 [&](
auto i) ->
const src_vector_t& {
369 return elm_thread_scratch_.GetVectorTypeReference(
370 data_idx_seq + i * dst_scalar_step_in_vector);
377 [&](
auto i) -> dst_vector_t& {
379 return dst_thread_scratch_.GetVectorTypeReference(
380 data_idx_seq + i * src_scalar_step_in_vector);
385 transpose_vectors<InterData, DstScalarPerVector, SrcScalarPerVector>{}(
386 src_vector_refs, dst_vector_refs);
391 static_ford<SliceLengths>{}(
392 [&](
auto idx) { dst_thread_scratch_(idx) = elm_thread_scratch_[idx]; });
402 template <
typename DstBuffers,
403 typename DstVgprDescs,
404 typename DstVgprBuffers,
406 enable_if_t<DstDescs::Size() == 1 && DstBuffers::Size() == 1,
bool> =
false>
411 DstVgprBuffers dst_vgpr_buf,
419 constexpr auto dst_slice_origin_idx =
421 constexpr auto dst_scalar_step_in_vector =
425 static_for<0, dst_num_access, 1>{}([&](
auto iAccess) {
426 auto dst_vectors = dst_vectors_tuple_[thread_scratch_id][iAccess];
428 static_for<0, nDst, 1>{}([&](
auto i) {
435 typename vector_type_maker<DstData, DstScalarPerVector>::type::type;
437 static_for<0, DstScalarPerVector, 1>{}([&](
auto j) {
438 dst_vector.template AsType<DstData>()(j) =
442 const bool is_dst_valid =
449 dst_bufs(i).template Update<DstInMemOp, dst_vector_t>(
450 dst_coords_[i].GetOffset(),
452 dst_vector.template AsType<dst_vector_t>()[
I0]);
455 using DstVgprDesc =
remove_cvref_t<
decltype(DstVgprDescs{}.At(i))>;
456 static_assert(DstVgprDesc::IsKnownAtCompileTime(),
457 "wrong! DstDesc need to known at compile-time");
458 constexpr auto dst_vgpr_desc = DstVgprDesc{};
461 static_for<0, DstScalarPerVector, 1>{}([&](
auto j) {
463 dst_vgpr_desc.CalculateOffset(
to_multi_index(dst_slice_origin_idx) +
464 src_data_idx + j * dst_scalar_step_in_vector);
467 is_dst_valid ? dst_vectors[i].
template AsType<InterData>()[j]
473 if constexpr(iAccess.value != dst_num_access - 1)
477 static_for<0, nDst, 1>{}([&](
auto i) {
485 static_for<0, nDst, 1>{}([&](
auto i) {
486 if constexpr(DstResetCoordinateAfterRunFlags::At(i))
488 const auto dst_reset_step =
498 template <
typename DstBuffers,
500 enable_if_t<DstDescs::Size() == 1 && DstBuffers::Size() == 1,
bool> =
false>
501 __device__
void RunWrite(
const DstDescs& dst_descs,
506 "RunWrite doesn't support inter data type different from dst data type");
513 auto dst_vectors = dst_vectors_tuple_[thread_scratch_id][iAccess];
516 static_for<0, nDst, 1>{}([&](
auto i) {
517 using dst_vector_t =
typename remove_cvref_t<
decltype(dst_vectors[i])>::type;
519 const bool is_dst_valid =
526 dst_bufs(i).template Update<DstInMemOp, dst_vector_t>(
527 dst_coords_[i].GetOffset(),
529 dst_vectors[i].
template AsType<dst_vector_t>()[
I0]);
533 if constexpr(iAccess.value != dst_num_access - 1)
537 static_for<0, nDst, 1>{}([&](
auto i) {
545 static_for<0, nDst, 1>{}([&](
auto i) {
546 if constexpr(DstResetCoordinateAfterRunFlags::At(i))
548 const auto dst_reset_step =
560 template <
typename SrcBuffers,
562 enable_if_t<SrcDescs::Size() == SrcBuffers::Size() &&
563 DstDescs::Size() == DstBuffers::Size(),
565 __device__
void Run(
const SrcDescs& src_descs,
566 const SrcBuffers& src_bufs,
567 const DstDescs& dst_descs,
576 if constexpr(src_num_access == 0)
588 if constexpr(dst_num_access == 0)
610 constexpr auto desc0 =
616 if constexpr(i == SrcVectorDim)
619 make_tuple(src_access_lengths_and_vector_length[i],
631 if constexpr(i == SrcVectorDim)
642 constexpr auto up_dim_idss =
660 constexpr auto desc0 =
666 if constexpr(i == DstVectorDim)
669 make_tuple(dst_access_lengths_and_vector_length[i],
681 if constexpr(i == DstVectorDim)
692 constexpr auto up_dim_idss =
699 template <index_t ISrc>
702 const Index& src_slice_origin_step_idx)
705 const auto adjusted_step_idx =
706 SrcResetCoordinateAfterRunFlags::At(iSrc)
707 ? src_slice_origin_step_idx
717 template <index_t IDst>
720 const Index& dst_slice_origin_step_idx)
723 const auto adjusted_step_idx =
724 DstResetCoordinateAfterRunFlags::At(iDst)
725 ? dst_slice_origin_step_idx
753 const ElementwiseOperation element_op_;
__host__ __device__ constexpr T min(T x)
Definition utility/math.hpp:116
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
decltype(ck::declval< T & >().is_pack8_invocable) is_pack8_invocable_t
Definition is_detected.hpp:43
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
__host__ __device__ constexpr auto container_push_back(const Array< TData, NSize > &a, const TData &x)
Definition utility/container_helper.hpp:18
InMemoryDataOperationEnum
Definition ck.hpp:277
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
decltype(ck::declval< T & >().is_pack4_invocable) is_pack4_invocable_t
Definition is_detected.hpp:40
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
typename detail::detector< nonesuch, void, Op, Args... >::value_t is_detected
Definition is_detected.hpp:34
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Vgpr
Definition amd_address_space.hpp:20
typename tuple_element< I, TTuple >::type tuple_element_t
Definition utility/tuple.hpp:208
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition utility/container_helper.hpp:380
__host__ __device__ constexpr Y type_convert(X x)
Definition utility/type_convert.hpp:98
__host__ __device__ constexpr auto generate_sequence_v2(F &&f, Number< N >)
Definition sequence_helper.hpp:25
__host__ __device__ constexpr auto to_multi_index(const T &x)
Definition array_multi_index.hpp:28
constexpr bool is_same_v
Definition type.hpp:283
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:84
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
decltype(ck::declval< T & >().is_pack2_invocable) is_pack2_invocable_t
Definition is_detected.hpp:37
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__host__ __device__ constexpr Y bit_cast(const X &x)
Definition type.hpp:306
typename remove_cv< T >::type remove_cv_t
Definition type.hpp:295
__host__ __device__ constexpr auto unpack2(F &&f, X &&x, Y &&y)
Definition functional4.hpp:55
__host__ __device__ constexpr index_t reduce_on_sequence(Seq, Reduce f, Number< Init >)
Definition utility/sequence.hpp:884
typename std::enable_if< B, T >::type enable_if_t
Definition enable_if.hpp:27
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
__host__ __device__ constexpr auto generate_tie(F &&f, Number< N >)
Definition tuple_helper.hpp:34
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
__host__ static __device__ constexpr T QuietNaN()
Definition numeric_limits.hpp:313
Definition utility/sequence.hpp:43
Definition tensor_space_filling_curve.hpp:20
static __device__ __host__ constexpr auto GetStepBetween(Number< AccessIdx1dBegin >, Number< AccessIdx1dEnd >)
Definition tensor_space_filling_curve.hpp:52
__host__ static __device__ constexpr index_t GetNumOfAccess()
Definition tensor_space_filling_curve.hpp:41
static __device__ __host__ constexpr Index GetIndex(Number< AccessIdx1d >)
Definition tensor_space_filling_curve.hpp:81
static __device__ __host__ constexpr auto GetForwardStep(Number< AccessIdx1d >)
Definition tensor_space_filling_curve.hpp:66
MultiIndex< nDim > Index
Definition tensor_space_filling_curve.hpp:23
__device__ void MoveSrcSliceWindow(const SrcDescs &src_descs, Number< ISrc > iSrc, const Index &src_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v7r3.hpp:700
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::SrcCoords decltype(MakeCoordinates(SrcDescs{}, StaticallyIndexedArray< Index, nSrc >{})) SrcCoords
Definition threadwise_tensor_slice_transfer_v7r3.hpp:73
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::SrcSpaceFillingCurve SpaceFillingCurve< decltype(thread_slice_lengths), SrcDimAccessOrder, remove_cv_t< decltype(src_scalar_per_access)>, false > SrcSpaceFillingCurve
Definition threadwise_tensor_slice_transfer_v7r3.hpp:84
__device__ void SetDstSliceOrigins(const DstDescs &dst_descs, const Indices &dst_slice_origin_idxs)
Definition threadwise_tensor_slice_transfer_v7r3.hpp:121
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::src_scalar_per_access static constexpr auto src_scalar_per_access
Definition threadwise_tensor_slice_transfer_v7r3.hpp:78
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::nSrc static constexpr index_t nSrc
Definition threadwise_tensor_slice_transfer_v7r3.hpp:54
__device__ void Run(const SrcDescs &src_descs, const SrcBuffers &src_bufs, const DstDescs &dst_descs, DstBuffers dst_bufs)
Definition threadwise_tensor_slice_transfer_v7r3.hpp:565
static __device__ constexpr auto GetSrcCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v7r3.hpp:574
__device__ void RunRead(const SrcDescs &src_descs, const SrcBuffers &src_bufs, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r3.hpp:150
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::nDim static constexpr index_t nDim
Definition threadwise_tensor_slice_transfer_v7r3.hpp:52
__device__ void SetSrcSliceOrigins(const SrcDescs &src_descs, const Indices &src_slice_origin_idxs)
Definition threadwise_tensor_slice_transfer_v7r3.hpp:112
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::I0 static constexpr auto I0
Definition threadwise_tensor_slice_transfer_v7r3.hpp:50
static __device__ constexpr auto GetDstThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v7r3.hpp:648
__device__ void RunWrite(const DstDescs &dst_descs, DstBuffers dst_bufs, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r3.hpp:501
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::nDst static constexpr index_t nDst
Definition threadwise_tensor_slice_transfer_v7r3.hpp:55
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::DstSpaceFillingCurve SpaceFillingCurve< decltype(thread_slice_lengths), DstDimAccessOrder, remove_cv_t< decltype(dst_scalar_per_access)>, false > DstSpaceFillingCurve
Definition threadwise_tensor_slice_transfer_v7r3.hpp:89
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::Index MultiIndex< nDim > Index
Definition threadwise_tensor_slice_transfer_v7r3.hpp:57
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::DstCoords decltype(MakeCoordinates(DstDescs{}, StaticallyIndexedArray< Index, nDst >{})) DstCoords
Definition threadwise_tensor_slice_transfer_v7r3.hpp:74
static constexpr auto MakeCoordinates(const Descs &descs, const Indices &indices)
Definition threadwise_tensor_slice_transfer_v7r3.hpp:63
__device__ constexpr ThreadwiseTensorSliceTransfer_v7r3(const SrcDescs &src_descs, const StaticallyIndexedArray< Index, nSrc > &src_slice_origins, const DstDescs &dst_descs, const StaticallyIndexedArray< Index, nDst > &dst_slice_origins, const ElementwiseOperation &element_op)
Definition threadwise_tensor_slice_transfer_v7r3.hpp:94
__device__ void RunWriteAndStoreVgpr(const DstDescs &dst_descs, DstBuffers dst_bufs, const DstVgprDescs &, DstVgprBuffers dst_vgpr_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r3.hpp:408
__device__ void OOBCheck(Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r3.hpp:280
static __device__ constexpr auto GetSrcThreadScratchDescriptor()
Definition threadwise_tensor_slice_transfer_v7r3.hpp:598
__device__ void TransposeFromElmToDst(Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{})
Definition threadwise_tensor_slice_transfer_v7r3.hpp:300
static __device__ auto generate_vectors()
Definition threadwise_tensor_slice_transfer_v7r3.hpp:130
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::dst_scalar_per_access static constexpr auto dst_scalar_per_access
Definition threadwise_tensor_slice_transfer_v7r3.hpp:81
static __device__ constexpr auto GetDstCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v7r3.hpp:586
ck::ThreadwiseTensorSliceTransfer_v7r3< SrcDatas, DstDatas, SrcDescs, DstDescs, ElementwiseOperation, DstInMemOps, decltype(thread_slice_lengths), SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVectors, DstScalarPerVector, ThreadTransferSrcResetCoordinateAfterRunFlags, ThreadTransferDstResetCoordinateAfterRunFlags, NumThreadScratch, InterDatas >::SrcScalarPerVector static constexpr auto SrcScalarPerVector
Definition threadwise_tensor_slice_transfer_v7r3.hpp:69
__device__ void MoveDstSliceWindow(const DstDescs &dst_descs, Number< IDst > iDst, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v7r3.hpp:718
Definition threadwise_tensor_slice_transfer_util.hpp:20
Definition utility/math.hpp:50
Definition functional2.hpp:33
vector_type< T, N > type
Definition dtype_vector.hpp:31