// Batched GEMM: for each i, dispatches c_i <- alpha * op(a_i) * op(b_i) + beta * c_i
// to device::/f77:: gemm_batch (uniform shapes) or gemm_vbatch (per-matrix shapes,
// when VBATCH is set).
// NOTE(review): this chunk is an incomplete extraction — the template header, several
// closing braces, the `else` branches (e.g. the plain-`return z;` arm of `to_mat`,
// the `#else`/`#endif` of the device guards, the early `return` after the transposed
// recursive call), and the definition of `ci` (presumably `to_mat(vc[i])`) are not
// visible here. Comments below describe only what the visible lines establish;
// confirm elided parts against the full file.
69 void gemm_batch(get_value_t<A> alpha, std::vector<A>
const &va, std::vector<B>
const &vb, get_value_t<A> beta, std::vector<C> &vc) {
// All three batches must have the same number of matrices.
71 EXPECTS(va.size() == vb.size()
and va.size() == vc.size());
// Empty batch: nothing to do.
72 if (va.empty())
return;
73 int batch_count = va.size();
// to_mat unwraps a conj(...) expression to its underlying matrix; for a plain
// matrix it presumably returns the argument unchanged (else-branch not visible).
76 auto to_mat = []<
typename Z>(Z &z) ->
auto & {
77 if constexpr (is_conj_array_expr<Z>)
78 return std::get<0>(z.a);
// First matrices of each batch, used below for type deduction and (in the
// fixed-shape branch) for the common dimensions.
82 auto &a0 = to_mat(va[0]);
83 auto &b0 = to_mat(vb[0]);
87 using mat_a_type =
decltype(a0);
88 using mat_b_type =
decltype(b0);
// A, B and C must live in compatible address spaces (all host or all device-usable).
89 static_assert(mem::have_compatible_addr_space<mat_a_type, mat_b_type, C>,
"Error in nda::blas::gemm_batch: Incompatible memory address spaces");
// BLAS wants column-major output: for C-layout (row-major) C, recurse on the
// transposed problem C^T = B^T * A^T instead.
92 if constexpr (has_C_layout<C>) {
94 auto map_transpose = [](
auto &v) {
95 auto vt = std::vector<std::decay_t<
decltype(transpose(v[0]))>>{};
97 std::transform(v.begin(), v.end(), std::back_inserter(vt), [](
auto &x) {
return transpose(x); });
100 auto vct = map_transpose(vc);
101 gemm_batch<VBATCH>(alpha, map_transpose(vb), map_transpose(va), beta, vct);
// Address space for the pointer/dimension arrays passed to the backend:
// plain host memory for host data, unified memory otherwise (device access).
105 auto constexpr vec_adr_spc = []() {
return mem::on_host<C> ? mem::Host : mem::Unified; }();
// get_ptrs: validates each batch entry (uniform shapes unless VBATCH; unit
// minimum stride, i.e. contiguous innermost dimension) and gathers the raw
// data pointers into an nda::vector in vec_adr_spc.
108 auto get_ptrs = [&to_mat]<
typename V>(V &v) {
109 EXPECTS(std::all_of(v.begin(), v.end(),
110 [&v, &to_mat](
auto &z) {
return (VBATCH
or z.shape() == v[0].shape())
and to_mat(z).indexmap().min_stride() == 1; }));
111 using value_t = get_value_t<
typename V::value_type>;
// Propagate constness of the container into the collected pointer type.
112 using ptr_t = std::conditional_t<std::is_const_v<V>, value_t
const *, value_t *>;
113 auto v_ptrs = nda::vector<ptr_t, heap<vec_adr_spc>>(v.size());
114 std::transform(v.begin(), v.end(), v_ptrs.begin(), [&to_mat](
auto &z) {
return to_mat(z).data(); });
117 auto a_ptrs = get_ptrs(va);
118 auto b_ptrs = get_ptrs(vb);
119 auto c_ptrs = get_ptrs(vc);
// Encode transpose/conjugate flags for the BLAS 'op' arguments from the
// expression type (conj) and the memory layout (C vs Fortran order).
122 static constexpr bool conj_A = is_conj_array_expr<A>;
123 static constexpr bool conj_B = is_conj_array_expr<B>;
124 char op_a = get_op<conj_A, has_C_layout<mat_a_type>>;
125 char op_b = get_op<conj_B, has_C_layout<mat_b_type>>;
// Variable-shape batch: collect per-matrix m/n/k and leading dimensions.
128 if constexpr (VBATCH) {
// Size batch_count + 1: the vbatch backends apparently expect a terminating
// slot past the last entry — TODO confirm against the backend's contract.
130 nda::vector<
int, heap<vec_adr_spc>> vm(batch_count + 1), vk(batch_count + 1), vn(batch_count + 1), vlda(batch_count + 1),
131 vldb(batch_count + 1), vldc(batch_count + 1);
133 for (
auto i : range(batch_count)) {
134 auto &ai = to_mat(va[i]);
135 auto &bi = to_mat(vb[i]);
// Per-entry dimension checks: (m x k) * (k x n) -> (m x n).
// NOTE(review): `ci` is used here but its definition was elided by the
// extraction (presumably `auto &ci = to_mat(vc[i]);`).
138 EXPECTS(ai.extent(1) == bi.extent(0));
139 EXPECTS(ai.extent(0) == ci.extent(0));
140 EXPECTS(bi.extent(1) == ci.extent(1));
142 vm[i] = ai.extent(0);
143 vk[i] = ai.extent(1);
144 vn[i] = bi.extent(1);
146 vlda[i] = get_ld(ai);
147 vldb[i] = get_ld(bi);
148 vldc[i] = get_ld(ci);
// Dispatch: device backend when all operands are device-compatible,
// otherwise the Fortran (f77) backend. The #else/#endif of the device
// guard is not visible in this chunk.
151 if constexpr (mem::have_device_compatible_addr_space<mat_a_type, mat_b_type, C>) {
152#if defined(NDA_HAVE_DEVICE)
153 device::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
154 c_ptrs.data(), vldc.data(), batch_count);
159 f77::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
160 c_ptrs.data(), vldc.data(), batch_count);
// Fixed-shape batch: all matrices share the shape of the first entry
// (enforced by get_ptrs above). NOTE(review): `c0` is used but not defined
// in the visible lines — presumably `to_mat(vc[0])`; confirm in full file.
164 EXPECTS(a0.extent(1) == b0.extent(0));
165 EXPECTS(a0.extent(0) == c0.extent(0));
166 EXPECTS(b0.extent(1) == c0.extent(1));
168 auto [m, k] = a0.shape();
169 auto n = b0.extent(1);
171 if constexpr (mem::have_device_compatible_addr_space<mat_a_type, mat_b_type, C>) {
172#if defined(NDA_HAVE_DEVICE)
// Trailing batch_count argument and the closing lines of both calls were
// elided by the extraction.
173 device::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0),
179 f77::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0),
// Body fragment of gemm_batch_strided: batched GEMM over 3-D arrays whose first
// axis is the batch index, using a fixed stride between consecutive matrices
// (arr.strides()[0]) instead of a pointer array.
// NOTE(review): the function signature/template header and several closing
// braces / else-branches were elided by the extraction; comments describe only
// the visible lines.
223 EXPECTS(a.shape()[0] == b.shape()[0]
and a.shape()[0] == c.shape()[0]);
// to_arr unwraps a conj(...) expression to its underlying array; the plain-array
// else-branch (presumably `return z;`) is not visible here.
226 auto to_arr = []<
typename Z>(Z &z) ->
auto & {
227 if constexpr (is_conj_array_expr<Z>)
228 return std::get<0>(z.a);
232 auto arr_a = to_arr(a);
233 auto arr_b = to_arr(b);
236 using arr_a_type =
decltype(arr_a);
237 using arr_b_type =
decltype(arr_b);
// All operands must live in compatible address spaces.
238 static_assert(mem::have_compatible_addr_space<arr_a_type, arr_b_type, C>,
239 "Error in nda::blas::gemm_batch_strided: Incompatible memory address spaces");
// First matrix of each batch; supplies the common m/n/k and leading dimensions.
242 auto _ = nda::range::all;
243 auto a0 = arr_a(0, _, _);
244 auto b0 = arr_b(0, _, _);
245 auto c0 = c(0, _, _);
// Dimension checks: (m x k) * (k x n) -> (m x n), and all arrays must be
// contiguous in their innermost dimension (unit minimum stride).
246 EXPECTS(a0.extent(1) == b0.extent(0));
247 EXPECTS(a0.extent(0) == c0.extent(0));
248 EXPECTS(b0.extent(1) == c0.extent(1));
249 EXPECTS(arr_a.indexmap().min_stride() == 1);
250 EXPECTS(arr_b.indexmap().min_stride() == 1);
251 EXPECTS(c.indexmap().min_stride() == 1);
// Row-major (C-layout) output: recurse on the transposed problem C^T = B^T A^T
// by swapping a/b and transposing the matrix axes (1, 2) of each 3-D array.
254 if constexpr (has_C_layout<C>) {
255 gemm_batch_strided(alpha, transposed_view<1, 2>(b), transposed_view<1, 2>(a), beta, transposed_view<1, 2>(std::forward<C>(c)));
// Encode transpose/conjugate flags for the BLAS 'op' arguments from the
// expression type (conj) and the memory layout.
258 static constexpr bool conj_A = is_conj_array_expr<A>;
259 static constexpr bool conj_B = is_conj_array_expr<B>;
260 char op_a = get_op<conj_A, has_C_layout<arr_a_type>>;
261 char op_b = get_op<conj_B, has_C_layout<arr_b_type>>;
262 auto [m, k] = a0.shape();
263 auto n = b0.extent(1);
// Dispatch: device backend when all operands are device-compatible, otherwise
// the Fortran (f77) backend. strides()[0] is the batch stride between matrices;
// the #else/#endif of the device guard is not visible in this chunk.
265 if constexpr (mem::have_device_compatible_addr_space<arr_a_type, arr_b_type, C>) {
266#if defined(NDA_HAVE_DEVICE)
267 device::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
268 arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0));
273 f77::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
274 arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0));