#pragma once
#include "cuda_err.cuh"
#include "cuda_ptr.cuh"
#include "rand.h"
constexpr size_t WARP_SIZE = 32;
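// Naive matrix multiply: C (n x k) = A (n x m) * B (m x k), one thread per
// output element, with every operand read directly from global memory.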
template <typename T>
__global__ void mat_mul_naive(const T *const __restrict__ mA,
                              const T *const __restrict__ mB,
                              T *const __restrict__ mC, const size_t n,
                              const size_t m, const size_t k) {
  const auto row = blockIdx.y * blockDim.y + threadIdx.y;
  const auto col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < n && col < k) {
    T sum = 0;
    for (auto i = 0; i < m; i++) {
      sum += mA[row * m + i] * mB[i * k + col];
    }
    mC[row * k + col] = sum;
  }
}
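// Tiled matrix multiply: each block stages WARP_SIZE x WARP_SIZE tiles of A and
// B in shared memory, so every global element is loaded once per tile rather
// than once per output element.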
template <typename T>
__global__ void mat_mul_shared(const T *const __restrict__ mA,
                               const T *const __restrict__ mB,
                               T *const __restrict__ mC, const size_t n,
                               const size_t m, const size_t k) {
  __shared__ T tA[WARP_SIZE][WARP_SIZE];
  __shared__ T tB[WARP_SIZE][WARP_SIZE];
  const auto tN = (m + WARP_SIZE - 1) / WARP_SIZE;
  const auto row = blockIdx.y * blockDim.y + threadIdx.y;
  const auto col = blockIdx.x * blockDim.x + threadIdx.x;
  T sum = 0;
  for (auto tI = 0; tI < tN; tI++) {
    const auto a_row = row;
    const auto a_col = tI * WARP_SIZE + threadIdx.x;
    if (a_row < n && a_col < m) {
      tA[threadIdx.y][threadIdx.x] = mA[a_row * m + a_col];
    } else {
      tA[threadIdx.y][threadIdx.x] = 0.0;
    }
    const auto b_row = tI * WARP_SIZE + threadIdx.y;
    const auto b_col = col;
    if (b_row < m && b_col < k) {
      tB[threadIdx.y][threadIdx.x] = mB[b_row * k + b_col];
    } else {
      tB[threadIdx.y][threadIdx.x] = 0.0;
    }
    __syncthreads();
    for (auto i = 0; i < WARP_SIZE; i++) {
      sum += tA[threadIdx.y][i] * tB[i][threadIdx.x];
    }
    __syncthreads();
  }
  if (row < n && col < k) {
    mC[row * k + col] = sum;
  }
}
// template <typename T>
// __device__ __forceinline__ T
// line_warp_reduce(const T *const __restrict__ tA_row,
//                  const T *const __restrict__ tB_col, unsigned int mask) {
//   T sum = 0;
// #pragma unroll
//   for (auto i = 0; i < WARP_SIZE; ++i) {
//     const T a_val = __shfl_sync(mask, tA_row[i], i);
//     const T b_val = __shfl_sync(mask, tB_col[i], i);
//     sum += a_val * b_val;
//   }
//   return sum;
// }
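// Tiled multiply where the A operand is broadcast across the warp with
// __shfl_sync rather than re-read from shared memory; see the comment inside
// the inner loop for why B still comes from the shared tile.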
template <typename T>
__global__ void mat_mul_shared_with_warp_intrinsics(
    const T *const __restrict__ mA, const T *const __restrict__ mB,
    T *const __restrict__ mC, const size_t n, const size_t m, const size_t k) {
  __shared__ T tA[WARP_SIZE][WARP_SIZE];
  __shared__ T tB[WARP_SIZE][WARP_SIZE];
  const auto tN = (m + WARP_SIZE - 1) / WARP_SIZE;
  const auto row = blockIdx.y * blockDim.y + threadIdx.y;
  const auto col = blockIdx.x * blockDim.x + threadIdx.x;
  T sum = 0;
  for (auto tI = 0; tI < tN; tI++) {
    const auto a_row = row;
    const auto a_col = tI * WARP_SIZE + threadIdx.x;
    if (a_row < n && a_col < m) {
      tA[threadIdx.y][threadIdx.x] = mA[a_row * m + a_col];
    } else {
      tA[threadIdx.y][threadIdx.x] = 0.0;
    }
    const auto b_row = tI * WARP_SIZE + threadIdx.y;
    const auto b_col = col;
    if (b_row < m && b_col < k) {
      tB[threadIdx.y][threadIdx.x] = mB[b_row * k + b_col];
    } else {
      tB[threadIdx.y][threadIdx.x] = 0.0;
    }
    __syncthreads();
    // Each warp spans one row of the 32x32 tile (blockDim.x == WARP_SIZE), so
    // lane i already holds tA[threadIdx.y][i] in a register. Broadcast it with
    // __shfl_sync instead of re-reading shared memory. The B column cannot be
    // shuffled the same way, because its elements live in other warps, so it is
    // still read from the shared tile.
    const T a_val_reg = tA[threadIdx.y][threadIdx.x];
    for (auto i = 0; i < WARP_SIZE; ++i) {
      const T a_val = __shfl_sync(0xFFFFFFFF, a_val_reg, i);
      sum += a_val * tB[i][threadIdx.x];
    }
    // for (auto i = 0; i < WARP_SIZE; i++) {
    //   sum += line_warp_reduce(tA[threadIdx.y], &tB[0][threadIdx.x], 0xFFFFFFFF);
    // }
    __syncthreads();
  }
  if (row < n && col < k) {
    mC[row * k + col] = sum;
  }
}
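// Out-of-place transpose: writes mB (m x n) = mA^T for an n x m input, one
// thread per element of the output.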
template <typename T>
__global__ void mat_transpose(const T *const __restrict__ mA,
                              T *const __restrict__ mB, const size_t n,
                              const size_t m) {
  const auto row = blockIdx.y * blockDim.y + threadIdx.y;
  const auto col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < m && col < n) {
    mB[row * n + col] = mA[col * m + row];
  }
}
namespace Mat {
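// Fixed-size matrix backed by a CudaPtr allocation. The dot_* and transpose
// methods prefetch their operands to device 0 (the CudaPtr storage is assumed
// to be CUDA managed memory, as cudaMemPrefetchAsync requires), launch the
// matching kernel with 32x32 blocks, and synchronize before returning.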
template <typename T, size_t N, size_t M> class Mat {
public:
  explicit Mat() : data_(CudaPtr<T>(N * M)) {}
  ~Mat() noexcept = default;
  Mat(const Mat &other) : data_(CudaPtr<T>(N * M)) {
    data_.copy_from_cuda_ptr(other.data_);
  }
  Mat &operator=(const Mat &other) = delete;
  Mat(Mat &&other) noexcept : data_(std::move(other.data_)) {}
  Mat &operator=(Mat &&other) = delete;
  size_t rows() const noexcept { return N; }
  size_t cols() const noexcept { return M; }
  size_t size() const noexcept { return data_.size(); }
  size_t size_bytes() const noexcept { return data_.size_bytes(); }
  T *data() noexcept { return data_.get(); }
  const T *data() const noexcept { return data_.get(); }
  T &at(const size_t i, const size_t j) noexcept { return data_[i * M + j]; }
  const T &at(const size_t i, const size_t j) const noexcept {
    return data_[i * M + j];
  }
  template <size_t K> Mat<T, N, K> dot_naive(const Mat<T, M, K> &other) const {
    Mat<T, N, K> result;
    if (N == 0 || M == 0 || K == 0) {
      return result;
    }
    dim3 dimBlock(WARP_SIZE, WARP_SIZE);
    dim3 dimGrid((K + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);
    CUDA_CHECK(cudaMemPrefetchAsync(data(), size_bytes(), 0));
    CUDA_CHECK(cudaMemPrefetchAsync(other.data(), other.size_bytes(), 0));
    mat_mul_naive<<<dimGrid, dimBlock>>>(data(), other.data(), result.data(), N,
                                         M, K);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    return result;
  };
  template <size_t K> Mat<T, N, K> dot_shared(const Mat<T, M, K> &other) const {
    Mat<T, N, K> result;
    if (N == 0 || M == 0 || K == 0) {
      return result;
    }
    dim3 dimBlock(WARP_SIZE, WARP_SIZE);
    dim3 dimGrid((K + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);
    CUDA_CHECK(cudaMemPrefetchAsync(data(), size_bytes(), 0));
    CUDA_CHECK(cudaMemPrefetchAsync(other.data(), other.size_bytes(), 0));
    mat_mul_shared<<<dimGrid, dimBlock>>>(data(), other.data(), result.data(),
                                          N, M, K);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    return result;
  };
  template <size_t K>
  Mat<T, N, K>
  dot_shared_with_warp_intrinsics(const Mat<T, M, K> &other) const {
    Mat<T, N, K> result;
    if (N == 0 || M == 0 || K == 0) {
      return result;
    }
    dim3 dimBlock(WARP_SIZE, WARP_SIZE);
    dim3 dimGrid((K + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);
    CUDA_CHECK(cudaMemPrefetchAsync(data(), size_bytes(), 0));
    CUDA_CHECK(cudaMemPrefetchAsync(other.data(), other.size_bytes(), 0));
    mat_mul_shared_with_warp_intrinsics<<<dimGrid, dimBlock>>>(
        data(), other.data(), result.data(), N, M, K);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    return result;
  };
  Mat<T, M, N> transpose() const {
    Mat<T, M, N> result;
    if (N == 0 || M == 0) {
      return result;
    }
    dim3 dimBlock(WARP_SIZE, WARP_SIZE);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (M + dimBlock.y - 1) / dimBlock.y);
    CUDA_CHECK(cudaMemPrefetchAsync(data(), size_bytes(), 0));
    mat_transpose<<<dimGrid, dimBlock>>>(data(), result.data(), N, M);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    return result;
  }
  static Mat<T, N, M> random() {
    Mat<T, N, M> mat;
    for (size_t i = 0; i < N * M; ++i) {
      mat.data_[i] = Random::getInstance().next();
    }
    return mat;
  }
private:
  CudaPtr<T> data_;
};
} // namespace Mat
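// Example usage (a minimal sketch, not part of the header; assumes this file is
// saved as mat.cuh and compiled with nvcc alongside cuda_err.cuh, cuda_ptr.cuh
// and rand.h):
//
//   #include "mat.cuh"
//
//   int main() {
//     const auto a = Mat::Mat<float, 128, 256>::random();
//     const auto b = Mat::Mat<float, 256, 64>::random();
//     const auto c0 = a.dot_naive(b);                       // 128 x 64
//     const auto c1 = a.dot_shared(b);                      // 128 x 64
//     const auto c2 = a.dot_shared_with_warp_intrinsics(b); // 128 x 64
//     const auto at = a.transpose();                        // 256 x 128
//     return 0;
//   }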