TRIQS/nda 1.3.0
Multi-dimensional array library for C++
Loading...
Searching...
No Matches
gemm_batch.hpp
Go to the documentation of this file.
1// Copyright (c) 2022--present, The Simons Foundation
2// This file is part of TRIQS/nda and is licensed under the Apache License, Version 2.0.
3// SPDX-License-Identifier: Apache-2.0
4// See LICENSE in the root of this distribution for details.
5
10
11#pragma once
12
14#include "./tools.hpp"
15#include "../concepts.hpp"
16#include "../declarations.hpp"
18#include "../macros.hpp"
20#include "../traits.hpp"
21
22#ifndef NDA_HAVE_DEVICE
23#include "../device.hpp"
24#endif // NDA_HAVE_DEVICE
25
26#include <algorithm>
27#include <iterator>
28#include <tuple>
29#include <type_traits>
30#include <vector>
31
32namespace nda::blas {
33
38
39 namespace detail {
40
41 // Get a vector of transpose matrices from a given vector of matrices.
42 auto get_transpose_vector(auto &&v) {
43 auto v_t = std::vector<std::decay_t<decltype(transpose(v[0]))>>{};
44 v_t.reserve(v.size());
45 std::transform(v.begin(), v.end(), std::back_inserter(v_t), [](auto &x) { return transpose(x); });
46 return v_t;
47 }
48
49 // Get a vector of pointers to the memory of matrices from a given vector of matrices.
50 template <bool is_vbatch, nda::mem::AddressSpace vec_addr_spc>
51 auto get_ptr_vector(auto &&v) {
52 EXPECTS(std::ranges::all_of(v, [&v](auto &A) { return is_vbatch or A.shape() == v[0].shape(); }));
53 EXPECTS(std::ranges::all_of(v, [](auto &A) { return get_array(A).indexmap().min_stride() == 1; }));
54 using ptr_t = std::remove_reference_t<decltype(get_first_element(v[0]))> *;
55 auto v_ptrs = nda::vector<ptr_t, heap<vec_addr_spc>>(v.size());
56 std::transform(v.begin(), v.end(), v_ptrs.begin(), [](auto &z) { return get_array(z).data(); });
57 return v_ptrs;
58 }
59
60 } // namespace detail
61
// NOTE(review): this listing was extracted from generated documentation; the numeric
// prefix on each line is the original source line number, and several lines were
// dropped by the extractor (85: rest of the requires-clause; 133/158: an
// `if constexpr` guard for device-compatible address spaces; 138/162: presumably a
// call to compile_error_no_gpu() — verify against the original header). The block
// below is therefore NOT compilable as shown.
// Purpose: batched GEMM, computing vc[i] = alpha * va[i] * vb[i] + beta * vc[i]
// for every i, dispatching to MKL's/CUDA's gemm_batch or Magma's gemm_vbatch.
 83 template <bool is_vbatch = false, Matrix A, Matrix B, MemoryMatrix C>
 84 requires((MemoryMatrix<A> or is_conj_array_expr<A>) and (MemoryMatrix<B> or is_conj_array_expr<B>)
 86 void gemm_batch(get_value_t<A> alpha, std::vector<A> const &va, std::vector<B> const &vb, get_value_t<A> beta, std::vector<C> &vc) {
 87 // check sizes of input vectors and return if they are empty
 88 EXPECTS(va.size() == vb.size() and va.size() == vc.size());
 89 if (va.empty()) return;
 90 auto const batch_count = va.size();
 91
 92 // if C is in C-layout, compute the transpose of the product in Fortran order
// (uses C^T = B^T * A^T: operands are transposed and swapped, then recurse once)
 93 if constexpr (has_C_layout<C>) {
 94 auto vcT = detail::get_transpose_vector(vc);
 95 return gemm_batch<is_vbatch>(alpha, detail::get_transpose_vector(vb), detail::get_transpose_vector(va), beta, vcT);
 96 } else {
 97 // for operations on the device, use unified memory for vector of ints or ptrs
 98 auto constexpr vec_addr_spc = []() { return mem::on_host<C> ? mem::Host : mem::Unified; }();
 99
 100 // convert the vector of matrices to the corresponding vector of pointers
 101 auto a_ptrs = detail::get_ptr_vector<is_vbatch, vec_addr_spc>(va);
 102 auto b_ptrs = detail::get_ptr_vector<is_vbatch, vec_addr_spc>(vb);
 103 auto c_ptrs = detail::get_ptr_vector<is_vbatch, vec_addr_spc>(vc);
 104
 105 // either call gemm_vbatch or gemm_batch
 106 if constexpr (is_vbatch) {
 107 // create vectors to store shapes and leading dimensions of size 'batch_count + 1' as required by Magma
 108 nda::vector<int, heap<vec_addr_spc>> vm(batch_count + 1), vk(batch_count + 1), vn(batch_count + 1), vlda(batch_count + 1),
 109 vldb(batch_count + 1), vldc(batch_count + 1);
 110
 111 for (auto i : range(batch_count)) {
// get_array unwraps a possible conj(...) lazy expression to the underlying array
 112 auto &&mat_a = get_array(va[i]);
 113 auto &&mat_b = get_array(vb[i]);
 114 auto &&mat_c = get_array(vc[i]);
 115
 116 // check the dimensions of the input/output arrays/views
 117 auto const [m, k] = mat_a.shape();
 118 auto const [l, n] = mat_b.shape();
 119 EXPECTS(k == l);
 120 EXPECTS(m == mat_c.extent(0));
 121 EXPECTS(n == mat_c.extent(1));
 122
 123 // store shapes and leading dimensions
 124 vm[i] = m;
 125 vk[i] = k;
 126 vn[i] = n;
 127 vlda[i] = get_ld(mat_a);
 128 vldb[i] = get_ld(mat_b);
 129 vldc[i] = get_ld(mat_c);
 130 }
 131
 132 // perform the actual library call
// NOTE(review): original line 133 (device-dispatch `if constexpr`) is missing here.
 134#if defined(NDA_HAVE_DEVICE)
 135 device::gemm_vbatch(get_op<A>, get_op<B>, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(),
 136 beta, c_ptrs.data(), vldc.data(), batch_count);
 137#else
// NOTE(review): original line 138 is missing — presumably compile_error_no_gpu().
 139#endif
 140 } else {
// fallback: CPU (Fortran 77 style) vbatch interface
 141 f77::gemm_vbatch(get_op<A>, get_op<B>, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
 142 c_ptrs.data(), vldc.data(), batch_count);
 143 }
 144 } else {
// uniform batch: all matrices share one shape, so only the first one is inspected
 145 auto &&mat_a = get_array(va[0]);
 146 auto &&mat_b = get_array(vb[0]);
 147 auto &&mat_c = get_array(vc[0]);
 148
 149 // check the dimensions of the input/output arrays/views
 150 auto const [m, k] = mat_a.shape();
 151 auto const [l, n] = mat_b.shape();
 152 EXPECTS(k == l);
 153 EXPECTS(m == mat_c.extent(0));
 154 EXPECTS(n == mat_c.extent(1));
 155
 156 // perform the actual library call
// NOTE(review): original line 157 (device-dispatch `if constexpr`) is missing here.
 158#if defined(NDA_HAVE_DEVICE)
 159 device::gemm_batch(get_op<A>, get_op<B>, m, n, k, alpha, a_ptrs.data(), get_ld(mat_a), b_ptrs.data(), get_ld(mat_b), beta, c_ptrs.data(),
 160 get_ld(mat_c), batch_count);
 161#else
// NOTE(review): original line 162 is missing — presumably compile_error_no_gpu().
 163#endif
 164 } else {
// fallback: CPU (Fortran 77 style) fixed-size batch interface
 165 f77::gemm_batch(get_op<A>, get_op<B>, m, n, k, alpha, a_ptrs.data(), get_ld(mat_a), b_ptrs.data(), get_ld(mat_b), beta, c_ptrs.data(),
 166 get_ld(mat_c), batch_count);
 167 }
 168 }
 169 }
 170 }
171
186 template <Matrix A, Matrix B, MemoryMatrix C>
187 void gemm_vbatch(get_value_t<A> alpha, std::vector<A> const &va, std::vector<B> const &vb, get_value_t<A> beta, std::vector<C> &vc) {
188 gemm_batch<true>(alpha, va, vb, beta, vc);
189 }
190
// NOTE(review): extracted from generated documentation; the numeric prefix on each
// line is the original source line number, and several lines were dropped by the
// extractor (208: rest of the requires-clause; 217: the recursive call on
// transposed views for the C-layout case; 242/247: device-dispatch guard and,
// presumably, compile_error_no_gpu()). NOT compilable as shown — verify against
// the original header.
// Purpose: strided batched GEMM on rank-3 arrays, treating the first axis as the
// batch index: c(i,_,_) = alpha * a(i,_,_) * b(i,_,_) + beta * c(i,_,_).
 206 template <ArrayOfRank<3> A, ArrayOfRank<3> B, MemoryArrayOfRank<3> C>
 207 requires((MemoryArrayOfRank<A, 3> or is_conj_array_expr<A>) and (MemoryArrayOfRank<B, 3> or is_conj_array_expr<B>)
 209 void gemm_batch_strided(get_value_t<A> alpha, A const &a, B const &b, get_value_t<A> beta, C &&c) {
 210 // check sizes of input arrays (number of matrices) and return if they are empty
 211 EXPECTS(a.shape()[0] == b.shape()[0] and a.shape()[0] == c.shape()[0]);
 212 if (a.size() == 0) return;
 213 auto const batch_count = a.shape()[0];
 214
 215 // if C is in C-layout, compute the transpose of the product in Fortran order
 216 if constexpr (has_C_layout<C>) {
// NOTE(review): original line 217 is missing — presumably the recursive call on
// transposed views (C^T = B^T * A^T) whose result this `return` hands back.
 218 return;
 219 } else {
 220 // get underlying array in case it is given as a conjugate expression
 221 auto arr_a = get_array(a);
 222 auto arr_b = get_array(b);
 223
 224 // get views of the first matrix in the batch
 225 auto a0 = arr_a(0, nda::range::all, nda::range::all);
 226 auto b0 = arr_b(0, nda::range::all, nda::range::all);
 227 auto c0 = c(0, nda::range::all, nda::range::all);
 228
 229 // check the dimensions of the input/output arrays/views
 230 auto const [m, k] = a0.shape();
 231 auto const [l, n] = b0.shape();
 232 EXPECTS(k == l);
 233 EXPECTS(m == c0.extent(0));
 234 EXPECTS(n == c0.extent(1));
 235
 236 // arrays/views must be BLAS compatible
// i.e. contiguous (stride 1) in their fastest-varying dimension
 237 EXPECTS(arr_a.indexmap().min_stride() == 1);
 238 EXPECTS(arr_b.indexmap().min_stride() == 1);
 239 EXPECTS(c.indexmap().min_stride() == 1);
 240
 241 // perform the actual library call
// NOTE(review): original line 242 (device-dispatch `if constexpr`) is missing here.
 243#if defined(NDA_HAVE_DEVICE)
// strides()[0] is the distance between consecutive matrices in the batch
 244 device::gemm_batch_strided(get_op<A>, get_op<B>, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
 245 arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], batch_count);
 246#else
// NOTE(review): original line 247 is missing — presumably compile_error_no_gpu().
 248#endif
 249 } else {
// fallback: CPU (Fortran 77 style) strided-batch interface
 250 f77::gemm_batch_strided(get_op<A>, get_op<B>, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
 251 arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], batch_count);
 252 }
 253 }
 254 }
255
257
258} // namespace nda::blas
Provides definitions and type traits involving the different memory address spaces supported by nda.
Provides a C++ interface for various BLAS routines.
ValueType const * data() const noexcept
Get a pointer to the actual data (in general this is not the beginning of the memory block for a view...
Check if a given type is an nda::MemoryArray of a certain rank.
Definition concepts.hpp:250
Provides concepts for the nda library.
Provides various convenient aliases and helper functions for nda::basic_array and nda::basic_array_view.
Provides GPU and non-GPU specific functionality.
auto transposed_view(A &&a)
Transpose two indices/dimensions of an nda::basic_array or nda::basic_array_view.
auto transpose(A &&a)
Transpose the memory layout of an nda::MemoryArray or an nda::expr_call.
basic_array< ValueType, 1, C_layout, 'V', ContainerPolicy > vector
Alias template of an nda::basic_array with rank 1 and a 'V' algebra.
decltype(auto) get_first_element(A &&a)
Get the first element of an array/view or simply return the scalar if a scalar is given.
Definition traits.hpp:167
constexpr bool have_same_value_type_v
Constexpr variable that is true if all types in As have the same value type as A0.
Definition traits.hpp:186
std::decay_t< decltype(get_first_element(std::declval< A const >()))> get_value_t
Get the value type of an array/view or a scalar type.
Definition traits.hpp:182
static constexpr bool has_C_layout
Constexpr variable that is true if the given nda::Array type has nda::C_layout.
Definition tools.hpp:83
int get_ld(A const &a)
Get the leading dimension of an nda::MemoryArray with rank 1 or 2 for BLAS/LAPACK calls.
Definition tools.hpp:122
static constexpr bool is_conj_array_expr
Constexpr variable that is true if the given type is a conjugate lazy expression.
Definition tools.hpp:41
static constexpr char get_op
Variable template that determines the BLAS matrix operation tag ('N','T','C') based on the given bool...
Definition tools.hpp:98
MemoryArray decltype(auto) get_array(A &&a)
Get the underlying array of a conjugate lazy expression or return the array itself in case it is an nda::MemoryArray.
Definition tools.hpp:62
void gemm_batch_strided(get_value_t< A > alpha, A const &a, B const &b, get_value_t< A > beta, C &&c)
Interface to MKL's/CUDA's gemm_batch_strided routine.
void gemm_vbatch(get_value_t< A > alpha, std::vector< A > const &va, std::vector< B > const &vb, get_value_t< A > beta, std::vector< C > &vc)
Interface to MKL's/Magma's gemm_vbatch routine.
void gemm_batch(get_value_t< A > alpha, std::vector< A > const &va, std::vector< B > const &vb, get_value_t< A > beta, std::vector< C > &vc)
Interface to MKL's/CUDA's gemm_batch and gemm_vbatch routines.
static constexpr bool have_compatible_addr_space
Constexpr variable that is true if all given types have compatible address spaces.
static constexpr bool have_device_compatible_addr_space
Constexpr variable that is true if all given types have an address space compatible with Device.
static constexpr bool on_host
Constexpr variable that is true if all given types have a Host address space.
void compile_error_no_gpu()
Trigger a compilation error in case GPU specific functionality is used without configuring the projec...
Definition device.hpp:36
constexpr bool is_blas_lapack_v
Alias for nda::is_double_or_complex_v.
Definition traits.hpp:92
Provides functions to transform the memory layout of an nda::basic_array or nda::basic_array_view.
Macros used in the nda library.
Provides various traits and utilities for the BLAS interface.
Provides type traits for the nda library.