TRIQS/nda 1.3.0
Multi-dimensional array library for C++
gemm_batch.hpp
// Copyright (c) 2022-2023 Simons Foundation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0.txt
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Authors: Miguel Morales, Nils Wentzell

/**
 * @file
 * @brief Provides a generic interface to batched versions of the BLAS `gemm` routine.
 */

#pragma once

#include "./interface/cxx_interface.hpp"
#include "./tools.hpp"
#include "../concepts.hpp"
#include "../declarations.hpp"
#include "../layout_transforms.hpp"
#include "../macros.hpp"
#include "../mem/address_space.hpp"
#include "../traits.hpp"

#ifndef NDA_HAVE_DEVICE
#include "../device.hpp"
#endif

#include <algorithm>
#include <iterator>
#include <tuple>
#include <type_traits>
#include <vector>

namespace nda::blas {

  /**
   * @addtogroup linalg_blas
   * @{
   */
  /**
   * @brief Implements a batched version of nda::blas::gemm taking vectors of matrices as arguments.
   *
   * @details This routine is a batched version of nda::blas::gemm, performing multiple `gemm` operations in a single
   * call. Each `gemm` operation computes a general matrix-matrix product, i.e. `vc[i] = alpha * va[i] * vb[i] + beta * vc[i]`.
   *
   * @tparam VBATCH Allow for variable-sized matrices.
   * @tparam A nda::Matrix type.
   * @tparam B nda::Matrix type.
   * @tparam C nda::MemoryMatrix type.
   * @param alpha Input scalar.
   * @param va std::vector of input matrices.
   * @param vb std::vector of input matrices.
   * @param beta Input scalar.
   * @param vc std::vector of input/output matrices.
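   *
   * A minimal usage sketch (illustrative only, assuming a host build and small double-valued matrices):
   * @code
   * auto A = nda::matrix<double>{{1, 2}, {3, 4}};
   * auto B = nda::matrix<double>{{5, 6}, {7, 8}};
   * auto C = nda::matrix<double>{{0, 0}, {0, 0}};
   * std::vector<nda::matrix<double>> va{A, A}, vb{B, B}, vc{C, C};
   * nda::blas::gemm_batch(1.0, va, vb, 0.0, vc); // vc[i] = A * B for each i
   * @endcode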
   */
  template <bool VBATCH = false, Matrix A, Matrix B, MemoryMatrix C>
    requires((MemoryMatrix<A> or is_conj_array_expr<A>) and (MemoryMatrix<B> or is_conj_array_expr<B>)
             and have_same_value_type_v<A, B, C> and is_blas_lapack_v<get_value_t<A>>)
  void gemm_batch(get_value_t<A> alpha, std::vector<A> const &va, std::vector<B> const &vb, get_value_t<A> beta, std::vector<C> &vc) {
    // check sizes
    EXPECTS(va.size() == vb.size() and va.size() == vc.size());
    if (va.empty()) return;
    int batch_count = va.size();

    // get underlying matrix in case it is given as a lazy expression
    auto to_mat = []<typename Z>(Z &z) -> auto & {
      if constexpr (is_conj_array_expr<Z>)
        return std::get<0>(z.a);
      else
        return z;
    };
    auto &a0 = to_mat(va[0]);
    auto &b0 = to_mat(vb[0]);
    auto &c0 = vc[0];

    // compile-time checks
    using mat_a_type = decltype(a0);
    using mat_b_type = decltype(b0);
    static_assert(mem::have_compatible_addr_space<mat_a_type, mat_b_type, C>, "Error in nda::blas::gemm_batch: Incompatible memory address spaces");

    // c is in C order: compute the transpose of the product in Fortran order
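    // (C = A * B implies C^T = B^T * A^T, so we take transposed views of all matrices, swap the roles
    // of a and b, and perform the computation on the resulting Fortran-order views)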
    if constexpr (has_C_layout<C>) {
      // transpose each matrix in the given vector
      auto map_transpose = [](auto &v) {
        auto vt = std::vector<std::decay_t<decltype(transpose(v[0]))>>{};
        vt.reserve(v.size());
        std::transform(v.begin(), v.end(), std::back_inserter(vt), [](auto &x) { return transpose(x); });
        return vt;
      };
      auto vct = map_transpose(vc);
      gemm_batch<VBATCH>(alpha, map_transpose(vb), map_transpose(va), beta, vct);
      return;
    } else { // c is in Fortran order
      // for operations on the device, use unified memory for vector of ints or ptrs
      auto constexpr vec_adr_spc = []() { return mem::on_host<C> ? mem::Host : mem::Unified; }();

      // convert the vector of matrices into the associated vector of pointers
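      // (each underlying matrix must be contiguous in its leading dimension, i.e. min_stride == 1,
      // and, unless VBATCH is true, must have the same shape as the first matrix in the vector)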
      auto get_ptrs = [&to_mat]<typename V>(V &v) {
        EXPECTS(std::all_of(v.begin(), v.end(),
                            [&v, &to_mat](auto &z) { return (VBATCH or z.shape() == v[0].shape()) and to_mat(z).indexmap().min_stride() == 1; }));
        using value_t = get_value_t<typename V::value_type>;
        using ptr_t = std::conditional_t<std::is_const_v<V>, value_t const *, value_t *>;
        auto v_ptrs = nda::vector<ptr_t, heap<vec_adr_spc>>(v.size());
        std::transform(v.begin(), v.end(), v_ptrs.begin(), [&to_mat](auto &z) { return to_mat(z).data(); });
        return v_ptrs;
      };
      auto a_ptrs = get_ptrs(va);
      auto b_ptrs = get_ptrs(vb);
      auto c_ptrs = get_ptrs(vc);

      // gather parameters for gemm call
      static constexpr bool conj_A = is_conj_array_expr<A>;
      static constexpr bool conj_B = is_conj_array_expr<B>;
      char op_a = get_op<conj_A, /* transpose = */ has_C_layout<mat_a_type>>;
      char op_b = get_op<conj_B, /* transpose = */ has_C_layout<mat_b_type>>;

      // matrices have different sizes
      if constexpr (VBATCH) {
        // create vectors of size 'batch_count + 1' as required by Magma
        nda::vector<int, heap<vec_adr_spc>> vm(batch_count + 1), vk(batch_count + 1), vn(batch_count + 1), vlda(batch_count + 1),
           vldb(batch_count + 1), vldc(batch_count + 1);

        for (auto i : range(batch_count)) {
          auto &ai = to_mat(va[i]);
          auto &bi = to_mat(vb[i]);
          auto &ci = vc[i];

          EXPECTS(ai.extent(1) == bi.extent(0));
          EXPECTS(ai.extent(0) == ci.extent(0));
          EXPECTS(bi.extent(1) == ci.extent(1));

          vm[i] = ai.extent(0);
          vk[i] = ai.extent(1);
          vn[i] = bi.extent(1);

          vlda[i] = get_ld(ai);
          vldb[i] = get_ld(bi);
          vldc[i] = get_ld(ci);
        }

        if constexpr (mem::have_device_compatible_addr_space<mat_a_type, mat_b_type, C>) {
#if defined(NDA_HAVE_DEVICE)
          device::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
                              c_ptrs.data(), vldc.data(), batch_count);
#else
          compile_error_no_gpu();
#endif
        } else {
          f77::gemm_vbatch(op_a, op_b, vm.data(), vn.data(), vk.data(), alpha, a_ptrs.data(), vlda.data(), b_ptrs.data(), vldb.data(), beta,
                           c_ptrs.data(), vldc.data(), batch_count);
        }
      } else {
        // all matrices have the same size
        EXPECTS(a0.extent(1) == b0.extent(0));
        EXPECTS(a0.extent(0) == c0.extent(0));
        EXPECTS(b0.extent(1) == c0.extent(1));

        auto [m, k] = a0.shape();
        auto n = b0.extent(1);

        if constexpr (mem::have_device_compatible_addr_space<mat_a_type, mat_b_type, C>) {
#if defined(NDA_HAVE_DEVICE)
          device::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0),
                             batch_count);
#else
          compile_error_no_gpu();
#endif
        } else {
          f77::gemm_batch(op_a, op_b, m, n, k, alpha, a_ptrs.data(), get_ld(a0), b_ptrs.data(), get_ld(b0), beta, c_ptrs.data(), get_ld(c0),
                          batch_count);
        }
      }
    }
  }

  /**
   * @brief Wrapper of nda::blas::gemm_batch that allows variable-sized matrices.
   *
   * @tparam A nda::Matrix type.
   * @tparam B nda::Matrix type.
   * @tparam C nda::MemoryMatrix type.
   * @param alpha Input scalar.
   * @param va std::vector of input matrices.
   * @param vb std::vector of input matrices.
   * @param beta Input scalar.
   * @param vc std::vector of input/output matrices.
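   *
   * A minimal sketch with matrices of different shapes (illustrative only, assuming a host build):
   * @code
   * auto A1 = nda::matrix<double>{{1, 2}, {3, 4}};         // 2x2
   * auto A2 = nda::matrix<double>{{1, 2, 3}, {4, 5, 6}};   // 2x3
   * auto B1 = nda::matrix<double>{{1, 0}, {0, 1}};         // 2x2
   * auto B2 = nda::matrix<double>{{1, 0}, {0, 1}, {1, 1}}; // 3x2
   * auto C  = nda::matrix<double>{{0, 0}, {0, 0}};         // 2x2 result for both products
   * std::vector<nda::matrix<double>> va{A1, A2}, vb{B1, B2}, vc{C, C};
   * nda::blas::gemm_vbatch(1.0, va, vb, 0.0, vc); // vc[i] = va[i] * vb[i]
   * @endcode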
   */
  template <Matrix A, Matrix B, MemoryMatrix C>
  void gemm_vbatch(get_value_t<A> alpha, std::vector<A> const &va, std::vector<B> const &vb, get_value_t<A> beta, std::vector<C> &vc) {
    gemm_batch</* VBATCH = */ true>(alpha, va, vb, beta, vc);
  }

  /**
   * @brief Implements a strided batched version of nda::blas::gemm taking 3-dimensional arrays as arguments.
   *
   * @details This function is similar to nda::blas::gemm_batch except that it takes 3-dimensional arrays as arguments
   * instead of vectors of matrices. The first dimension of the arrays indexes the matrices to be multiplied.
   *
   * @tparam A nda::ArrayOfRank<3> type.
   * @tparam B nda::ArrayOfRank<3> type.
   * @tparam C nda::MemoryArrayOfRank<3> type.
   * @param alpha Input scalar.
   * @param a 3-dimensional input array.
   * @param b 3-dimensional input array.
   * @param beta Input scalar.
   * @param c 3-dimensional input/output array.
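   *
   * A minimal sketch with 3-dimensional arrays (illustrative only, assuming a host build):
   * @code
   * auto a = nda::array<double, 3>(4, 2, 3); // 4 matrices of size 2x3
   * auto b = nda::array<double, 3>(4, 3, 2); // 4 matrices of size 3x2
   * auto c = nda::array<double, 3>(4, 2, 2); // 4 result matrices of size 2x2
   * a = 1.0;
   * b = 2.0;
   * c = 0.0;
   * nda::blas::gemm_batch_strided(1.0, a, b, 0.0, c); // c(i,_,_) = a(i,_,_) * b(i,_,_) for each i
   * @endcode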
   */
  template <ArrayOfRank<3> A, ArrayOfRank<3> B, MemoryArrayOfRank<3> C>
    requires((MemoryArrayOfRank<A, 3> or (is_conj_array_expr<A>)) and (MemoryArrayOfRank<B, 3> or (is_conj_array_expr<B>))
             and have_same_value_type_v<A, B, C> and is_blas_lapack_v<get_value_t<A>>)
  void gemm_batch_strided(get_value_t<A> alpha, A const &a, B const &b, get_value_t<A> beta, C &&c) {
    // check number of matrices
    EXPECTS(a.shape()[0] == b.shape()[0] and a.shape()[0] == c.shape()[0]);

    // get underlying array in case it is given as a lazy expression
    auto to_arr = []<typename Z>(Z &z) -> auto & {
      if constexpr (is_conj_array_expr<Z>)
        return std::get<0>(z.a);
      else
        return z;
    };
    auto arr_a = to_arr(a);
    auto arr_b = to_arr(b);

    // compile-time check
    using arr_a_type = decltype(arr_a);
    using arr_b_type = decltype(arr_b);
    static_assert(mem::have_compatible_addr_space<arr_a_type, arr_b_type, C>,
                  "Error in nda::blas::gemm_batch_strided: Incompatible memory address spaces");

    // runtime checks
    auto _ = nda::range::all;
    auto a0 = arr_a(0, _, _);
    auto b0 = arr_b(0, _, _);
    auto c0 = c(0, _, _);
    EXPECTS(a0.extent(1) == b0.extent(0));
    EXPECTS(a0.extent(0) == c0.extent(0));
    EXPECTS(b0.extent(1) == c0.extent(1));
    EXPECTS(arr_a.indexmap().min_stride() == 1);
    EXPECTS(arr_b.indexmap().min_stride() == 1);
    EXPECTS(c.indexmap().min_stride() == 1);

    // c is in C order: compute the transpose of the product in Fortran order
    if constexpr (has_C_layout<C>) {
      gemm_batch_strided(alpha, transposed_view<1, 2>(b), transposed_view<1, 2>(a), beta, transposed_view<1, 2>(std::forward<C>(c)));
      return;
    } else { // c is in Fortran order
      static constexpr bool conj_A = is_conj_array_expr<A>;
      static constexpr bool conj_B = is_conj_array_expr<B>;
      char op_a = get_op<conj_A, /* transpose = */ has_C_layout<arr_a_type>>;
      char op_b = get_op<conj_B, /* transpose = */ has_C_layout<arr_b_type>>;
      auto [m, k] = a0.shape();
      auto n = b0.extent(1);

      if constexpr (mem::have_device_compatible_addr_space<arr_a_type, arr_b_type, C>) {
#if defined(NDA_HAVE_DEVICE)
        device::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
                                   arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0));
#else
        compile_error_no_gpu();
#endif
      } else {
        f77::gemm_batch_strided(op_a, op_b, m, n, k, alpha, arr_a.data(), get_ld(a0), arr_a.strides()[0], arr_b.data(), get_ld(b0),
                                arr_b.strides()[0], beta, c.data(), get_ld(c0), c.strides()[0], arr_a.extent(0));
      }
    }
  }

  /** @} */

} // namespace nda::blas