TRIQS/nda 1.3.0
Multi-dimensional array library for C++
gemm_batch.hpp
// Copyright (c) 2022-2023 Simons Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0.txt
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Authors: Miguel Morales, Nils Wentzell

/**
 * @file
 * @brief Provides a generic interface to batched versions of the BLAS `gemm` routine.
 */

#pragma once

#include "./interface/cxx_interface.hpp"
#include "./tools.hpp"
#include "../concepts.hpp"
#include "../declarations.hpp"
#include "../layout_transforms.hpp"
#include "../macros.hpp"
#include "../mem/address_space.hpp"
#include "../traits.hpp"

#ifndef NDA_HAVE_DEVICE
#include "../device.hpp"
#endif

#include <algorithm>
#include <iterator>
#include <tuple>
#include <type_traits>
#include <vector>

namespace nda::blas {

  /**
   * @addtogroup linalg_blas
   * @{
   */
  /**
   * @brief Implements a batched version of nda::blas::gemm taking vectors of matrices as arguments.
   *
   * @details This routine is a batched version of nda::blas::gemm, performing multiple `gemm` operations in a single
   * call. Each `gemm` operation computes a general matrix-matrix product, i.e. `vc[i] = alpha * va[i] * vb[i] + beta * vc[i]`.
   *
   * @tparam VBATCH Allow for variable-sized matrices.
   * @tparam A nda::Matrix type.
   * @tparam B nda::Matrix type.
   * @tparam C nda::MemoryMatrix type.
   * @param alpha Input scalar.
   * @param va std::vector of input matrices.
   * @param vb std::vector of input matrices.
   * @param beta Input scalar.
   * @param vc std::vector of input/output matrices.
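   *
   * A minimal usage sketch (illustrative only, assuming a host build and small double-valued matrices):
   * @code
   * auto A = nda::matrix<double>{{1, 2}, {3, 4}};
   * auto B = nda::matrix<double>{{5, 6}, {7, 8}};
   * auto C = nda::matrix<double>{{0, 0}, {0, 0}};
   * std::vector<nda::matrix<double>> va{A, A}, vb{B, B}, vc{C, C};
   * nda::blas::gemm_batch(1.0, va, vb, 0.0, vc); // vc[i] = A * B for each i
   * @endcode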
   */
  template <bool VBATCH = false, Matrix A, Matrix B, MemoryMatrix C>
    requires((MemoryMatrix<A> or is_conj_array_expr<A>) and (MemoryMatrix<B> or is_conj_array_expr<B>)
             and have_same_value_type_v<A, B, C> and is_blas_lapack_v<get_value_t<A>>)
  void gemm_batch(get_value_t<A> alpha, std::vector<A> const &va, std::vector<B> const &vb, get_value_t<A> beta, std::vector<C> &vc) {
    // check sizes
    EXPECTS(va.size() == vb.size() and va.size() == vc.size());
    if (va.empty()) return;
    int batch_count = va.size();

    // get underlying matrix in case it is given as a lazy expression
    auto to_mat = []<typename Z>(Z &z) -> auto & {
      if constexpr (is_conj_array_expr<Z>)
        return std::get<0>(z.a);
      else
        return z;
    };
    auto &a0 = to_mat(va[0]);
    auto &b0 = to_mat(vb[0]);
    auto &c0 = vc[0];

    // compile-time checks
    using mat_a_type = decltype(a0);
    using mat_b_type = decltype(b0);
    static_assert(mem::have_compatible_addr_space<mat_a_type, mat_b_type, C>, "Error in nda::blas::gemm_batch: Incompatible memory address spaces");

    // c is in C order: compute the transpose of the product in Fortran order
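    // (C = A * B implies C^T = B^T * A^T, so we take transposed views of all matrices, swap the roles
    // of a and b, and perform the computation on the resulting Fortran-order views)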
    if constexpr (has_C_layout<C>) {
      // transpose each matrix in the given vector
      auto map_transpose = [](auto &v) {
        auto vt = std::vector<std::decay_t<decltype(transpose(v[0]))>>{};
        vt.reserve(v.size());
        std::transform(v.begin(), v.end(), std::back_inserter(vt), [](auto &x) { return transpose(x); });
        return vt;
      };
      auto vct = map_transpose(vc);
      gemm_batch<VBATCH>(alpha, map_transpose(vb), map_transpose(va), beta, vct);
      return;
    } else { // c is in Fortran order
      // for operations on the device, use unified memory for vector of ints or ptrs
      auto constexpr vec_adr_spc = []() { return mem::on_host<C> ? mem::Host : mem::Unified; }();

      // convert the vector of matrices into the associated vector of pointers
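      // (each underlying matrix must be contiguous in its leading dimension, i.e. min_stride == 1,
      // and, unless VBATCH is true, must have the same shape as the first matrix in the vector)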
      auto get_ptrs = [&to_mat]<typename V>(V &v) {
        EXPECTS(std::all_of(v.begin(), v.end(),
                            [&v, &to_mat](auto &z) { return (VBATCH or z.shape() == v[0].shape()) and to_mat(z).indexmap().min_stride() == 1; }));
        using value_t = get_value_t<typename V::value_type>;
        using ptr_t = std::conditional_t<std::is_const_v<V>, value_t const *, value_t *>;
        auto v_ptrs = nda::vector<ptr_t, heap<vec_adr_spc>>(v.size());
        std::transform(v.begin(), v.end(), v_ptrs.begin(), [&to_mat](auto &z) { return to_mat(z).data(); });
        return v_ptrs;
      };
      auto a_ptrs = get_ptrs(va);
      auto b_ptrs = get_ptrs(vb);
      auto c_ptrs = get_ptrs(vc);

      // gather parameters for gemm call
      static constexpr bool conj_A = is_conj_array_expr<A>;
      static constexpr bool conj_B = is_conj_array_expr<B>;
      char op_a = get_op<conj_A, /* transpose = */ has_C_layout<mat_a_type>>;
      char op_b = get_op<conj_B, /* transpose = */ has_C_layout<mat_b_type>>;

      // matrices have different sizes
      if constexpr (VBATCH) {
        // create vectors of size 'batch_count + 1' as required by Magma
        nda::vector<int, heap<vec_adr_spc>> vm(batch_count + 1), vk(batch_count + 1), vn(batch_count + 1), vlda(batch_count + 1),
           vldb(batch_count + 1), vldc(batch_count + 1);

        for (auto i : range(batch_count)) {
          auto &ai = to_mat(va[i]);
          auto &bi = to_mat(vb[i]);
          auto &ci = vc[i];

          EXPECTS(ai.extent(1) == bi.extent(0));
          EXPECTS(ai.extent(0) == ci.extent(0));
          EXPECTS(bi.extent(1) == ci.extent(1));

          vm[i] = ai.extent(0);
          vk[i] = ai.extent(1);
          vn[i] = bi.extent(1);

          vlda[i] = get_ld(ai);
          vldb[i] = get_ld(bi);
          vldc[i] = get_ld(ci);
        }

        if constexpr (mem::have_device_compatible_addr_space<mat_a_type, mat_b_type, C>) {
#if defined(NDA_HAVE_DEVICE)
          device::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
                              c_ptrs.data(), vldc.data(), batch_count);
#else
          compile_error_no_gpu();
#endif
        } else {
          f77::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
                           c_ptrs.data(), vldc.data(), batch_count);
        }
      } else {
        // all matrices have the same size
        EXPECTS(a0.extent(1) == b0.extent(0));
        EXPECTS(a0.extent(0) == c0.extent(0));
        EXPECTS(b0.extent(1) == c0.extent(1));

        auto [m, k] = a0.shape();
        auto n = b0.extent(1);

        if constexpr (mem::have_device_compatible_addr_space<mat_a_type, mat_b_type, C>) {
#if defined(NDA_HAVE_DEVICE)
          device::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0),
                             batch_count);
#else
          compile_error_no_gpu();
#endif
        } else {
          f77::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0),
                          batch_count);
        }
      }
    }
  }

  /**
   * @brief Wrapper of nda::blas::gemm_batch that allows variable-sized matrices.
   *
   * @tparam A nda::Matrix type.
   * @tparam B nda::Matrix type.
   * @tparam C nda::MemoryMatrix type.
   * @param alpha Input scalar.
   * @param va std::vector of input matrices.
   * @param vb std::vector of input matrices.
   * @param beta Input scalar.
   * @param vc std::vector of input/output matrices.
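   *
   * A minimal sketch with matrices of different shapes (illustrative only, assuming a host build):
   * @code
   * auto A1 = nda::matrix<double>{{1, 2}, {3, 4}};         // 2x2
   * auto A2 = nda::matrix<double>{{1, 2, 3}, {4, 5, 6}};   // 2x3
   * auto B1 = nda::matrix<double>{{1, 0}, {0, 1}};         // 2x2
   * auto B2 = nda::matrix<double>{{1, 0}, {0, 1}, {1, 1}}; // 3x2
   * auto C  = nda::matrix<double>{{0, 0}, {0, 0}};         // 2x2 result for both products
   * std::vector<nda::matrix<double>> va{A1, A2}, vb{B1, B2}, vc{C, C};
   * nda::blas::gemm_vbatch(1.0, va, vb, 0.0, vc); // vc[i] = va[i] * vb[i]
   * @endcode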
   */
  template <Matrix A, Matrix B, MemoryMatrix C>
  void gemm_vbatch(get_value_t<A> alpha, std::vector<A> const &va, std::vector<B> const &vb, get_value_t<A> beta, std::vector<C> &vc) {
    gemm_batch</* VBATCH = */ true>(alpha, va, vb, beta, vc);
  }

  /**
   * @brief Implements a strided batched version of nda::blas::gemm taking 3-dimensional arrays as arguments.
   *
   * @details This function is similar to nda::blas::gemm_batch except that it takes 3-dimensional arrays as arguments
   * instead of vectors of matrices. The first dimension of the arrays indexes the matrices to be multiplied.
   *
   * @tparam A nda::ArrayOfRank<3> type.
   * @tparam B nda::ArrayOfRank<3> type.
   * @tparam C nda::MemoryArrayOfRank<3> type.
   * @param alpha Input scalar.
   * @param a 3-dimensional input array.
   * @param b 3-dimensional input array.
   * @param beta Input scalar.
   * @param c 3-dimensional input/output array.
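   *
   * A minimal sketch with 3-dimensional arrays (illustrative only, assuming a host build):
   * @code
   * auto a = nda::array<double, 3>(4, 2, 3); // 4 matrices of size 2x3
   * auto b = nda::array<double, 3>(4, 3, 2); // 4 matrices of size 3x2
   * auto c = nda::array<double, 3>(4, 2, 2); // 4 result matrices of size 2x2
   * a = 1.0;
   * b = 2.0;
   * c = 0.0;
   * nda::blas::gemm_batch_strided(1.0, a, b, 0.0, c); // c(i,_,_) = a(i,_,_) * b(i,_,_) for each i
   * @endcode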
   */
  template <ArrayOfRank<3> A, ArrayOfRank<3> B, MemoryArrayOfRank<3> C>
    requires((MemoryArrayOfRank<A, 3> or (is_conj_array_expr<A>)) and (MemoryArrayOfRank<B, 3> or (is_conj_array_expr<B>))
             and have_same_value_type_v<A, B, C> and is_blas_lapack_v<get_value_t<A>>)
  void gemm_batch_strided(get_value_t<A> alpha, A const &a, B const &b, get_value_t<A> beta, C &&c) {
    // check number of matrices
    EXPECTS(a.shape()[0] == b.shape()[0] and a.shape()[0] == c.shape()[0]);

    // get underlying array in case it is given as a lazy expression
    auto to_arr = []<typename Z>(Z &z) -> auto & {
      if constexpr (is_conj_array_expr<Z>)
        return std::get<0>(z.a);
      else
        return z;
    };
    auto arr_a = to_arr(a);
    auto arr_b = to_arr(b);

    // compile-time check
    using arr_a_type = decltype(arr_a);
    using arr_b_type = decltype(arr_b);
    static_assert(mem::have_compatible_addr_space<arr_a_type, arr_b_type, C>,
                  "Error in nda::blas::gemm_batch_strided: Incompatible memory address spaces");

    // runtime checks
    auto _ = nda::range::all;
    auto a0 = arr_a(0, _, _);
    auto b0 = arr_b(0, _, _);
    auto c0 = c(0, _, _);
    EXPECTS(a0.extent(1) == b0.extent(0));
    EXPECTS(a0.extent(0) == c0.extent(0));
    EXPECTS(b0.extent(1) == c0.extent(1));
    EXPECTS(arr_a.indexmap().min_stride() == 1);
    EXPECTS(arr_b.indexmap().min_stride() == 1);
    EXPECTS(c.indexmap().min_stride() == 1);

    // c is in C order: compute the transpose of the product in Fortran order
    if constexpr (has_C_layout<C>) {
      gemm_batch_strided(alpha, transposed_view<1, 2>(b), transposed_view<1, 2>(a), beta, transposed_view<1, 2>(std::forward<C>(c)));
      return;
    } else { // c is in Fortran order
      static constexpr bool conj_A = is_conj_array_expr<A>;
      static constexpr bool conj_B = is_conj_array_expr<B>;
      char op_a = get_op<conj_A, /* transpose = */ has_C_layout<arr_a_type>>;
      char op_b = get_op<conj_B, /* transpose = */ has_C_layout<arr_b_type>>;
      auto [m, k] = a0.shape();
      auto n = b0.extent(1);

      if constexpr (mem::have_device_compatible_addr_space<arr_a_type, arr_b_type, C>) {
#if defined(NDA_HAVE_DEVICE)
        device::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
                                   arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0));
#else
        compile_error_no_gpu();
#endif
      } else {
        f77::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
                                arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0));
      }
    }
  }

  /** @} */

} // namespace nda::blas