TRIQS/nda 1.3.0
Multi-dimensional array library for C++
Loading...
Searching...
No Matches
gemm_batch.hpp
Go to the documentation of this file.
1// Copyright (c) 2022--present, The Simons Foundation
2// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0.
3// SPDX-License-Identifier: Apache-2.0
4// See LICENSE in the root of this distribution for details.
5
10
11#pragma once
12
14#include "./tools.hpp"
15#include "../concepts.hpp"
16#include "../declarations.hpp"
18#include "../macros.hpp"
20#include "../traits.hpp"
21
22#ifndef NDA_HAVE_DEVICE
23#include "../device.hpp"
24#endif
25
26#include <algorithm>
27#include <iterator>
28#include <tuple>
29#include <type_traits>
30#include <vector>
31
32namespace nda::blas {
33
38
// NOTE(review): this listing was scraped from a Doxygen "documentation of this
// file" page. The number at the start of each line is the ORIGINAL source line
// number; gaps in that numbering (57, 140, 145, 160, 165, ...) are lines the
// scrape dropped, so the code below is syntactically incomplete as shown.
//
// gemm_batch: batched GEMM over vectors of matrices, i.e. for every i it
// computes vc[i] = alpha * va[i] * vb[i] + beta * vc[i]. With VBATCH == true
// the matrices in the batch may have different shapes and the *_vbatch
// backends are used; inputs may also be conjugate lazy expressions
// (is_conj_array_expr), which are mapped to a 'C' op tag for the backend.
55 template <bool VBATCH = false, Matrix A, Matrix B, MemoryMatrix C>
// NOTE(review): the requires-clause below is truncated — its continuation
// (original line 57, presumably value-type/blas-compatibility conditions)
// was dropped by the scrape. Verify against the repository header.
56 requires((MemoryMatrix<A> or is_conj_array_expr<A>) and (MemoryMatrix<B> or is_conj_array_expr<B>)
58 void gemm_batch(get_value_t<A> alpha, std::vector<A> const &va, std::vector<B> const &vb, get_value_t<A> beta, std::vector<C> &vc) {
59 // check sizes
60 EXPECTS(va.size() == vb.size() and va.size() == vc.size());
// Empty batch is a no-op by design (avoids indexing va[0] below).
61 if (va.empty()) return;
62 int batch_count = va.size();
63
64 // get underlying matrix in case it is given as a lazy expression
// For conj(x) expressions, z.a holds the wrapped operand tuple; the
// conjugation itself is communicated to the backend via the op tag later.
65 auto to_mat = []<typename Z>(Z &z) -> auto & {
66 if constexpr (is_conj_array_expr<Z>)
67 return std::get<0>(z.a);
68 else
69 return z;
70 };
71 auto &a0 = to_mat(va[0]);
72 auto &b0 = to_mat(vb[0]);
73 auto &c0 = vc[0];
74
75 // compile-time checks
76 using mat_a_type = decltype(a0);
77 using mat_b_type = decltype(b0);
78 static_assert(mem::have_compatible_addr_space<mat_a_type, mat_b_type, C>, "Error in nda::blas::gemm_batch: Incompatible memory address spaces");
79
80 // c is in C order: compute the transpose of the product in Fortran order
// BLAS backends are column-major; (A*B)^T = B^T * A^T, so swap and
// transpose the operands and recurse once with transposed views of C.
81 if constexpr (has_C_layout<C>) {
82 // transpose each matrix in the given vector
83 auto map_transpose = [](auto &v) {
84 auto vt = std::vector<std::decay_t<decltype(transpose(v[0]))>>{};
85 vt.reserve(v.size());
86 std::transform(v.begin(), v.end(), std::back_inserter(vt), [](auto &x) { return transpose(x); });
87 return vt;
88 };
89 auto vct = map_transpose(vc);
90 gemm_batch<VBATCH>(alpha, map_transpose(vb), map_transpose(va), beta, vct);
91 return;
92 } else { // c is in Fortran order
93 // for operations on the device, use unified memory for vector of ints or ptrs
94 auto constexpr vec_adr_spc = []() { return mem::on_host<C> ? mem::Host : mem::Unified; }();
95
96 // convert the vector of matrices into the associated vector of pointers
// Precondition (checked): uniform shapes unless VBATCH, and contiguous
// leading dimension (min_stride == 1) as required by BLAS.
97 auto get_ptrs = [&to_mat]<typename V>(V &v) {
98 EXPECTS(std::all_of(v.begin(), v.end(),
99 [&v, &to_mat](auto &z) { return (VBATCH or z.shape() == v[0].shape()) and to_mat(z).indexmap().min_stride() == 1; }));
// NOTE(review): original line 100 (presumably the declaration of
// value_t used just below) was dropped by the scrape.
101 using ptr_t = std::conditional_t<std::is_const_v<V>, value_t const *, value_t *>;
102 auto v_ptrs = nda::vector<ptr_t, heap<vec_adr_spc>>(v.size());
103 std::transform(v.begin(), v.end(), v_ptrs.begin(), [&to_mat](auto &z) { return to_mat(z).data(); });
104 return v_ptrs;
105 };
106 auto a_ptrs = get_ptrs(va);
107 auto b_ptrs = get_ptrs(vb);
108 auto c_ptrs = get_ptrs(vc);
109
110 // gather parameters for gemm call
111 static constexpr bool conj_A = is_conj_array_expr<A>;
112 static constexpr bool conj_B = is_conj_array_expr<B>;
113 char op_a = get_op<conj_A, /* transpose = */ has_C_layout<mat_a_type>>;
114 char op_b = get_op<conj_B, /* transpose = */ has_C_layout<mat_b_type>>;
115
116 // matrices have different sizes
117 if constexpr (VBATCH) {
118 // create vectors of size 'batch_count + 1' as required by Magma
119 nda::vector<int, heap<vec_adr_spc>> vm(batch_count + 1), vk(batch_count + 1), vn(batch_count + 1), vlda(batch_count + 1),
120 vldb(batch_count + 1), vldc(batch_count + 1);
121
122 for (auto i : range(batch_count)) {
123 auto &ai = to_mat(va[i]);
124 auto &bi = to_mat(vb[i]);
125 auto &ci = vc[i];
126
// Per-matrix conformity: (m x k) * (k x n) -> (m x n).
127 EXPECTS(ai.extent(1) == bi.extent(0));
128 EXPECTS(ai.extent(0) == ci.extent(0));
129 EXPECTS(bi.extent(1) == ci.extent(1));
130
131 vm[i] = ai.extent(0);
132 vk[i] = ai.extent(1);
133 vn[i] = bi.extent(1);
134
135 vlda[i] = get_ld(ai);
136 vldb[i] = get_ld(bi);
137 vldc[i] = get_ld(ci);
138 }
139
// NOTE(review): original line 140 — presumably
// "if constexpr (mem::have_device_compatible_addr_space<...>) {" — was
// dropped by the scrape; the '} else {' at line 147 below pairs with it.
141#if defined(NDA_HAVE_DEVICE)
142 device::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
143 c_ptrs.data(), vldc.data(), batch_count);
144#else
// NOTE(review): original line 145 — presumably compile_error_no_gpu()
// (see the tooltip text at the bottom of this page) — was dropped.
146#endif
147 } else {
148 f77::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
149 c_ptrs.data(), vldc.data(), batch_count);
150 }
151 } else {
152 // all matrices have the same size
153 EXPECTS(a0.extent(1) == b0.extent(0));
154 EXPECTS(a0.extent(0) == c0.extent(0));
155 EXPECTS(b0.extent(1) == c0.extent(1));
156
157 auto [m, k] = a0.shape();
158 auto n = b0.extent(1);
159
// NOTE(review): original line 160 (the device-address-space
// "if constexpr" guard, matching the '} else {' at line 167) was dropped.
161#if defined(NDA_HAVE_DEVICE)
162 device::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0),
163 batch_count);
164#else
// NOTE(review): original line 165 — presumably compile_error_no_gpu() — dropped.
166#endif
167 } else {
168 f77::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0),
169 batch_count);
170 }
171 }
172 }
173 }
174
187 template <Matrix A, Matrix B, MemoryMatrix C>
188 void gemm_vbatch(get_value_t<A> alpha, std::vector<A> const &va, std::vector<B> const &vb, get_value_t<A> beta, std::vector<C> &vc) {
189 gemm_batch</* VBATCH = */ true>(alpha, va, vb, beta, vc);
190 }
191
// NOTE(review): scraped Doxygen listing — the leading number on each line is
// the original source line number, and the gaps in that numbering (209, 227,
// 244, 254, 259) are lines the scrape dropped; the code below is therefore
// syntactically incomplete as shown.
//
// gemm_batch_strided: strided batched GEMM on rank-3 arrays. Slice i along
// the first axis of each array is one matrix of the batch:
// c(i,_,_) = alpha * a(i,_,_) * b(i,_,_) + beta * c(i,_,_).
207 template <ArrayOfRank<3> A, ArrayOfRank<3> B, MemoryArrayOfRank<3> C>
// NOTE(review): requires-clause truncated — its continuation (original
// line 209) was dropped by the scrape.
208 requires((MemoryArrayOfRank<A, 3> or (is_conj_array_expr<A>)) and (MemoryArrayOfRank<B, 3> or (is_conj_array_expr<B>))
210 void gemm_batch_strided(get_value_t<A> alpha, A const &a, B const &b, get_value_t<A> beta, C &&c) {
211 // check number of matrices
212 EXPECTS(a.shape()[0] == b.shape()[0] and a.shape()[0] == c.shape()[0]);
213
214 // get underlying array in case it is given as a lazy expression
// For conj(x) expressions, z.a holds the wrapped operand; conjugation is
// passed to the backend via the op tag below.
215 auto to_arr = []<typename Z>(Z &z) -> auto & {
216 if constexpr (is_conj_array_expr<Z>)
217 return std::get<0>(z.a);
218 else
219 return z;
220 };
221 auto arr_a = to_arr(a);
222 auto arr_b = to_arr(b);
223
224 // compile-time check
225 using arr_a_type = decltype(arr_a);
226 using arr_b_type = decltype(arr_b);
// NOTE(review): the opening of the static_assert (original line 227,
// presumably mem::have_compatible_addr_space<arr_a_type, arr_b_type, C>)
// was dropped; line 228 below is only its message/closing part.
228 "Error in nda::blas::gemm_batch_strided: Incompatible memory address spaces");
229
230 // runtime checks
231 auto _ = nda::range::all;
// First matrix of each batch, used for the conformity checks and to read
// m, k, n and the leading dimensions.
232 auto a0 = arr_a(0, _, _);
233 auto b0 = arr_b(0, _, _);
234 auto c0 = c(0, _, _);
235 EXPECTS(a0.extent(1) == b0.extent(0));
236 EXPECTS(a0.extent(0) == c0.extent(0));
237 EXPECTS(b0.extent(1) == c0.extent(1));
// BLAS requires a contiguous fastest-varying dimension.
238 EXPECTS(arr_a.indexmap().min_stride() == 1);
239 EXPECTS(arr_b.indexmap().min_stride() == 1);
240 EXPECTS(c.indexmap().min_stride() == 1);
241
242 // c is in C order: compute the transpose of the product in Fortran order
243 if constexpr (has_C_layout<C>) {
// NOTE(review): original line 244 — presumably the recursive call on the
// transposed operands, e.g. gemm_batch_strided(alpha, transposed_view(b),
// transposed_view(a), beta, transposed_view(c)) — was dropped; verify
// against the repository header.
245 return;
246 } else { // c is in Fortran order
247 static constexpr bool conj_A = is_conj_array_expr<A>;
248 static constexpr bool conj_B = is_conj_array_expr<B>;
249 char op_a = get_op<conj_A, /* transpose = */ has_C_layout<arr_a_type>>;
250 char op_b = get_op<conj_B, /* transpose = */ has_C_layout<arr_b_type>>;
251 auto [m, k] = a0.shape();
252 auto n = b0.extent(1);
253
// NOTE(review): original line 254 (the device-address-space
// "if constexpr" guard, matching the '} else {' at line 261) was dropped.
255#if defined(NDA_HAVE_DEVICE)
// strides()[0] is the batch stride: distance between consecutive matrices.
256 device::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
257 arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0));
258#else
// NOTE(review): original line 259 — presumably compile_error_no_gpu() — dropped.
260#endif
261 } else {
262 f77::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
263 arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0));
264 }
265 }
266 }
267
269
270} // namespace nda::blas
Provides definitions and type traits involving the different memory address spaces supported by nda.
Provides a C++ interface for various BLAS routines.
ValueType const * data() const noexcept
Get a pointer to the actual data (in general this is not the beginning of the memory block for a view).
Check if a given type is an nda::MemoryArray of a certain rank.
Definition concepts.hpp:275
Provides concepts for the nda library.
Provides various convenient aliases and helper functions for nda::basic_array and nda::basic_array_view.
Provides GPU and non-GPU specific functionality.
auto transposed_view(A &&a)
Transpose two indices/dimensions of an nda::basic_array or nda::basic_array_view.
auto transpose(A &&a)
Transpose the memory layout of an nda::MemoryArray or an nda::expr_call.
basic_array< ValueType, 1, C_layout, 'V', ContainerPolicy > vector
Alias template of an nda::basic_array with rank 1 and a 'V' algebra.
constexpr bool have_same_value_type_v
Constexpr variable that is true if all types in As have the same value type as A0.
Definition traits.hpp:185
std::decay_t< decltype(get_first_element(std::declval< A const >()))> get_value_t
Get the value type of an array/view or a scalar type.
Definition traits.hpp:181
int get_ld(A const &a)
Get the leading dimension in LAPACK jargon of an nda::MemoryMatrix.
Definition tools.hpp:98
void gemm_batch_strided(get_value_t< A > alpha, A const &a, B const &b, get_value_t< A > beta, C &&c)
Implements a strided batched version of nda::blas::gemm taking 3-dimensional arrays as arguments.
static constexpr bool has_C_layout
Constexpr variable that is true if the given nda::Array type has a C memory layout.
Definition tools.hpp:65
void gemm_vbatch(get_value_t< A > alpha, std::vector< A > const &va, std::vector< B > const &vb, get_value_t< A > beta, std::vector< C > &vc)
Wrapper of nda::blas::gemm_batch that allows variable sized matrices.
static constexpr bool is_conj_array_expr
Constexpr variable that is true if the given type is a conjugate lazy expression.
Definition tools.hpp:41
void gemm_batch(get_value_t< A > alpha, std::vector< A > const &va, std::vector< B > const &vb, get_value_t< A > beta, std::vector< C > &vc)
Implements a batched version of nda::blas::gemm taking vectors of matrices as arguments.
const char get_op
Variable template that determines the BLAS matrix operation tag ('N','T','C') based on the given boolean flags.
Definition tools.hpp:80
static constexpr bool have_compatible_addr_space
Constexpr variable that is true if all given types have compatible address spaces.
static constexpr bool have_device_compatible_addr_space
Constexpr variable that is true if all given types have an address space compatible with Device.
static constexpr bool on_host
Constexpr variable that is true if all given types have a Host address space.
void compile_error_no_gpu()
Trigger a compilation error in case GPU specific functionality is used without configuring the project with GPU support.
Definition device.hpp:36
constexpr bool is_blas_lapack_v
Alias for nda::is_double_or_complex_v.
Definition traits.hpp:91
Provides functions to transform the memory layout of an nda::basic_array or nda::basic_array_view.
Macros used in the nda library.
Provides various traits and utilities for the BLAS interface.
Provides type traits for the nda library.