threadwise_tensor_slice_transfer_v6r2.hpp Source File

threadwise_tensor_slice_transfer_v6r2.hpp Source File#

Composable Kernel: threadwise_tensor_slice_transfer_v6r2.hpp Source File
threadwise_tensor_slice_transfer_v6r2.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
10
11namespace ck {
12
13// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory
14// and sometimes useless instructions:
15// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument
16// instead
17// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same
18// tensor coordinate instead
19// 3. Don't use a pointer to VGPR buffer, use vector instead
20
21// Assume:
22// 1. src0_desc and dst_desc are not known at compile-time
23// 2. SrcBuffer and DstBuffer are DynamicBuffer
24// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
25template <typename Src0Data,
26 typename Src1Data,
27 typename DstData,
28 typename Src0Desc,
29 typename Src1Desc,
30 typename DstDesc,
31 typename ElementwiseOperation,
32 typename SliceLengths,
33 typename DimAccessOrder,
34 index_t VectorDim,
35 index_t ScalarPerVector,
37 bool Src0ResetCoordinateAfterRun,
38 bool Src1ResetCoordinateAfterRun,
39 bool DstResetCoordinateAfterRun>
41{
42 static constexpr index_t nDim = SliceLengths::Size();
43
45
46 using Src0Coord = decltype(make_tensor_coordinate(Src0Desc{}, Index{}));
47 using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{}));
48 using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
49
50 static constexpr auto I0 = Number<0>{};
51
52 __device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc& src0_desc,
53 const Index& src0_slice_origin,
54 const Src1Desc& src1_desc,
55 const Index& src1_slice_origin,
56 const DstDesc& dst_desc,
57 const Index& dst_slice_origin,
58 const ElementwiseOperation& element_op)
59 : src0_coord_(make_tensor_coordinate(src0_desc, src0_slice_origin)),
60 src1_coord_(make_tensor_coordinate(src1_desc, src1_slice_origin)),
61 dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)),
62 element_op_(element_op)
63 {
64 static_assert(SliceLengths::At(Number<VectorDim>{}) % ScalarPerVector == 0,
65 "wrong! cannot evenly divide");
66 }
67
68 __device__ void SetSrc0SliceOrigin(const Src0Desc& src0_desc,
69 const Index& src0_slice_origin_idx)
70 {
71 src0_coord_ = make_tensor_coordinate(src0_desc, src0_slice_origin_idx);
72 }
73
74 __device__ void SetSrc1SliceOrigin(const Src1Desc& src1_desc,
75 const Index& src1_slice_origin_idx)
76 {
77 src1_coord_ = make_tensor_coordinate(src1_desc, src1_slice_origin_idx);
78 }
79
80 __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
81 {
82 dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
83 }
84
85 template <typename Src0Buffer, typename Src1Buffer, typename DstBuffer>
86 __device__ void Run(const Src0Desc& src0_desc,
87 const Src0Buffer& src0_buf,
88 const Src1Desc& src1_desc,
89 const Src1Buffer& src1_buf,
90 const DstDesc& dst_desc,
91 DstBuffer& dst_buf)
92 {
93 // scalar per access on each dim
94 // TODO: don't use lambda_scalar_per_access
95 constexpr auto scalar_per_access = generate_sequence(
97
98 using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
99 DimAccessOrder,
100 remove_cv_t<decltype(scalar_per_access)>>;
101
102 constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
103
104 // loop over space-filling curve
105 static_for<0, num_access, 1>{}([&](auto idx_1d) {
106 using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
107 using src0_vector_t = typename src0_vector_type::type;
108
109 using src1_vector_type = vector_type_maker_t<Src1Data, ScalarPerVector>;
110 using src1_vector_t = typename src1_vector_type::type;
111
112 using dst_vector_type = vector_type_maker_t<DstData, ScalarPerVector>;
113 using dst_vector_t = typename dst_vector_type::type;
114
115 const bool is_src0_valid =
117
118 const bool is_src1_valid =
120
121 // copy data from src0_buf into src0_vector_container
122 auto src0_vector_container = src0_vector_type{
123 src0_buf.template Get<src0_vector_t>(src0_coord_.GetOffset(), is_src0_valid)};
124
125 auto src1_vector_container = src1_vector_type{
126 src1_buf.template Get<src1_vector_t>(src1_coord_.GetOffset(), is_src1_valid)};
127
128 auto dst_vector_container = dst_vector_type{};
129
130 // apply pointwise operation
132 element_op_(dst_vector_container.template AsType<DstData>()(i),
133 src0_vector_container.template AsType<Src0Data>()[i],
134 src1_vector_container.template AsType<Src1Data>()[i]);
135 });
136
137 const bool is_dst_valid =
139
140 // copy data from dst_vector into dst_buf
141 dst_buf.template Update<DstInMemOp, dst_vector_t>(
142 dst_coord_.GetOffset(),
143 is_dst_valid,
144 dst_vector_container.template AsType<dst_vector_t>()[I0]);
145
146 // move coordinate
147 if constexpr(idx_1d.value != num_access - 1)
148 {
149 constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
151 src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
153 src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
155 dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
156 }
157 });
158
159 // move coordinate back to slice origin (or not)
160 if constexpr(Src0ResetCoordinateAfterRun)
161 {
162 const auto src0_reset_step =
164
165 move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step);
166 }
167
168 if constexpr(Src1ResetCoordinateAfterRun)
169 {
170 const auto src1_reset_step =
172
173 move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step);
174 }
175
176 if constexpr(DstResetCoordinateAfterRun)
177 {
178 const auto dst_reset_step =
180
181 move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
182 }
183 }
184
185 __device__ static constexpr auto GetCoordinateResetStep()
186 {
187 constexpr auto scalar_per_access = generate_sequence(
189
190 using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
191 DimAccessOrder,
192 remove_cv_t<decltype(scalar_per_access)>>;
193
194 constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess();
195 if constexpr(num_access == 0)
196 {
197 return typename SpaceFillingCurve::Index{};
198 }
199 else
200 {
201 constexpr auto reset_step =
203
204 return reset_step;
205 }
206 }
207
208 // src_slice_origin_step_idx need to be known at compile-time, for performance reason
209 __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc,
210 const Index& src0_slice_origin_step_idx)
211 {
212 // if src coord was not reset by RunRead(), then need to adjust the step here
213 const auto adjusted_step_idx = Src0ResetCoordinateAfterRun
214 ? src0_slice_origin_step_idx
215 : src0_slice_origin_step_idx + GetCoordinateResetStep();
216
217 // is it OK to construct a new step every time?
218 const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx);
219
220 move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step);
221 }
222
223 // src_slice_origin_step_idx need to be known at compile-time, for performance reason
224 __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc,
225 const Index& src1_slice_origin_step_idx)
226 {
227 // if src coord was not reset by RunRead(), then need to adjust the step here
228 const auto adjusted_step_idx = Src1ResetCoordinateAfterRun
229 ? src1_slice_origin_step_idx
230 : src1_slice_origin_step_idx + GetCoordinateResetStep();
231
232 // is it OK to construct a new step every time?
233 const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx);
234
235 move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step);
236 }
237
238 // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
239 __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
240 const Index& dst_slice_origin_step_idx)
241 {
242 // if dst coord was not reset by Run(), then need to adjust the step here
243 const auto adjusted_step_idx = DstResetCoordinateAfterRun
244 ? dst_slice_origin_step_idx
245 : dst_slice_origin_step_idx + GetCoordinateResetStep();
246
247 // is it OK to construct a new step every time?
248 const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
249
250 move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
251 }
252
253 private:
254 Src0Coord src0_coord_;
255 Src1Coord src1_coord_;
256 DstCoord dst_coord_;
257 const ElementwiseOperation element_op_;
258};
259
260} // namespace ck
Definition ck.hpp:268
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
InMemoryDataOperationEnum
Definition ck.hpp:277
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
integral_constant< index_t, N > Number
Definition number.hpp:12
__host__ __device__ constexpr auto generate_sequence(F, Number< N >)
Definition sequence_helper.hpp:18
typename remove_cv< T >::type remove_cv_t
Definition type.hpp:295
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
Definition tensor_space_filling_curve.hpp:20
static __device__ __host__ constexpr auto GetStepBetween(Number< AccessIdx1dBegin >, Number< AccessIdx1dEnd >)
Definition tensor_space_filling_curve.hpp:52
__host__ static __device__ constexpr index_t GetNumOfAccess()
Definition tensor_space_filling_curve.hpp:41
static __device__ __host__ constexpr auto GetForwardStep(Number< AccessIdx1d >)
Definition tensor_space_filling_curve.hpp:66
MultiIndex< nDim > Index
Definition tensor_space_filling_curve.hpp:23
__device__ void SetSrc0SliceOrigin(const Src0Desc &src0_desc, const Index &src0_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:68
__device__ void SetSrc1SliceOrigin(const Src1Desc &src1_desc, const Index &src1_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:74
static __device__ constexpr auto GetCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v6r2.hpp:185
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:80
__device__ void MoveSrc0SliceWindow(const Src0Desc &src0_desc, const Index &src0_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:209
__device__ void Run(const Src0Desc &src0_desc, const Src0Buffer &src0_buf, const Src1Desc &src1_desc, const Src1Buffer &src1_buf, const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:86
__device__ void MoveSrc1SliceWindow(const Src1Desc &src1_desc, const Index &src1_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:224
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:239
__device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc &src0_desc, const Index &src0_slice_origin, const Src1Desc &src1_desc, const Index &src1_slice_origin, const DstDesc &dst_desc, const Index &dst_slice_origin, const ElementwiseOperation &element_op)
Definition threadwise_tensor_slice_transfer_v6r2.hpp:52
Definition threadwise_tensor_slice_transfer_util.hpp:20
Definition functional2.hpp:33