#pragma once

#include "cuda_err.cuh"
#include "cuda_ptr.cuh"
#include "rand.h"

#include <cstddef>
#include <utility>

const size_t WARP_SIZE = 32;

template <typename T>
__global__ void mat_mul_naive(const T *const __restrict__ mA,
                              const T *const __restrict__ mB,
                              T *const __restrict__ mC, const size_t n,
                              const size_t m, const size_t k) {
  const auto row = blockIdx.y * blockDim.y + threadIdx.y;
  const auto col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < n && col < k) {
    T sum = 0;
    for (size_t i = 0; i < m; i++) {
      sum += mA[row * m + i] * mB[i * k + col];
    }
    mC[row * k + col] = sum;
  }
}

template <typename T>
__global__ void mat_mul_shared(const T *const __restrict__ mA,
                               const T *const __restrict__ mB,
                               T *const __restrict__ mC, const size_t n,
                               const size_t m, const size_t k) {
  __shared__ T tA[WARP_SIZE][WARP_SIZE];
  __shared__ T tB[WARP_SIZE][WARP_SIZE];

  const auto tN = (m + WARP_SIZE - 1) / WARP_SIZE;
  const auto row = blockIdx.y * blockDim.y + threadIdx.y;
  const auto col = blockIdx.x * blockDim.x + threadIdx.x;

  T sum = 0;
  for (size_t tI = 0; tI < tN; tI++) {
    // Stage one WARP_SIZE x WARP_SIZE tile of A and B into shared memory,
    // zero-padding threads that fall outside the matrix bounds.
    const auto a_row = row;
    const auto a_col = tI * WARP_SIZE + threadIdx.x;
    if (a_row < n && a_col < m) {
      tA[threadIdx.y][threadIdx.x] = mA[a_row * m + a_col];
    } else {
      tA[threadIdx.y][threadIdx.x] = T(0);
    }

    const auto b_row = tI * WARP_SIZE + threadIdx.y;
    const auto b_col = col;
    if (b_row < m && b_col < k) {
      tB[threadIdx.y][threadIdx.x] = mB[b_row * k + b_col];
    } else {
      tB[threadIdx.y][threadIdx.x] = T(0);
    }

    __syncthreads();

    for (size_t i = 0; i < WARP_SIZE; i++) {
      sum += tA[threadIdx.y][i] * tB[i][threadIdx.x];
    }

    __syncthreads();
  }

  if (row < n && col < k) {
    mC[row * k + col] = sum;
  }
}

// template <typename T>
// __device__ __forceinline__ T
// line_warp_reduce(const T *const __restrict__ tA_row,
//                  const T *const __restrict__ tB_col, unsigned int mask) {
//   T sum = 0;
// #pragma unroll
//   for (auto i = 0; i < WARP_SIZE; ++i) {
//     const T a_val = __shfl_sync(mask, tA_row[i], i);
//     const T b_val = __shfl_sync(mask, tB_col[i], i);
//     sum += a_val * b_val;
//   }
//   return sum;
// }

template <typename T>
__global__ void mat_mul_shared_with_warp_intrinsics(
    const T *const __restrict__ mA, const T *const __restrict__ mB,
    T *const __restrict__ mC, const size_t n, const size_t m, const size_t k) {
  __shared__ T tA[WARP_SIZE][WARP_SIZE];
  __shared__ T tB[WARP_SIZE][WARP_SIZE];

  const auto tN = (m + WARP_SIZE - 1) / WARP_SIZE;
  const auto row = blockIdx.y * blockDim.y + threadIdx.y;
  const auto col = blockIdx.x * blockDim.x + threadIdx.x;

  T sum = 0;
  for (size_t tI = 0; tI < tN; tI++) {
    const auto a_row = row;
    const auto a_col = tI * WARP_SIZE + threadIdx.x;
    if (a_row < n && a_col < m) {
      tA[threadIdx.y][threadIdx.x] = mA[a_row * m + a_col];
    } else {
      tA[threadIdx.y][threadIdx.x] = T(0);
    }

    const auto b_row = tI * WARP_SIZE + threadIdx.y;
    const auto b_col = col;
    if (b_row < m && b_col < k) {
      tB[threadIdx.y][threadIdx.x] = mB[b_row * k + b_col];
    } else {
      tB[threadIdx.y][threadIdx.x] = T(0);
    }

    __syncthreads();

    // With blockDim = (WARP_SIZE, WARP_SIZE), each warp owns one row of the
    // tile, so lane i holds tA[threadIdx.y][i]; broadcast it with __shfl_sync
    // instead of re-reading shared memory. The matching element
    // tB[i][threadIdx.x] lives in a different warp, so it still comes from
    // shared memory.
    const T a_val_reg = tA[threadIdx.y][threadIdx.x];
#pragma unroll
    for (int i = 0; i < static_cast<int>(WARP_SIZE); ++i) {
      const T a_val = __shfl_sync(0xFFFFFFFF, a_val_reg, i);
      sum += a_val * tB[i][threadIdx.x];
    }
    // for (auto i = 0; i < WARP_SIZE; i++) {
    //   sum += line_warp_reduce(tA[threadIdx.y], &tB[0][threadIdx.x],
    //                           0xFFFFFFFF);
    // }

    __syncthreads();
  }

  if (row < n && col < k) {
    mC[row * k + col] = sum;
  }
}

template <typename T>
__global__ void mat_transpose(const T *const __restrict__ mA,
                              T *const __restrict__ mB, const size_t n,
                              const size_t m) {
  const auto row = blockIdx.y * blockDim.y + threadIdx.y;
  const auto col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < m && col < n) {
    mB[row * n + col] = mA[col * m + row];
  }
}

namespace Mat {

template <typename T, size_t N, size_t M> class Mat {
public:
  explicit Mat() : data_(CudaPtr<T>(N * M)) {}
  ~Mat() noexcept = default;

  Mat(const Mat &other) : data_(CudaPtr<T>(N * M)) {
    data_.copy_from_cuda_ptr(other.data_);
  }
  Mat &operator=(const Mat &other) = delete;

  Mat(Mat &&other) noexcept : data_(std::move(other.data_)) {}
  Mat &operator=(Mat &&other) = delete;

  size_t rows() const noexcept { return N; }
  size_t cols() const noexcept { return M; }
  size_t size() const noexcept { return data_.size(); }
  size_t size_bytes() const noexcept { return data_.size_bytes(); }

  T *data() noexcept { return data_.get(); }
  const T *data() const noexcept { return data_.get(); }

  T &at(const size_t i, const size_t j) noexcept { return data_[i * M + j]; }
  const T &at(const size_t i, const size_t j) const noexcept {
    return data_[i * M + j];
  }

  template <size_t K>
  Mat<T, N, K> dot_naive(const Mat<T, M, K> &other) const {
    Mat<T, N, K> result;
    if (N == 0 || M == 0 || K == 0) {
      return result;
    }

    dim3 dimBlock(WARP_SIZE, WARP_SIZE);
    dim3 dimGrid((K + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);

    CUDA_CHECK(cudaMemPrefetchAsync(data(), size_bytes(), 0));
    CUDA_CHECK(cudaMemPrefetchAsync(other.data(), other.size_bytes(), 0));

    mat_mul_naive<<<dimGrid, dimBlock>>>(data(), other.data(), result.data(),
                                         N, M, K);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    return result;
  }

  template <size_t K>
  Mat<T, N, K> dot_shared(const Mat<T, M, K> &other) const {
    Mat<T, N, K> result;
    if (N == 0 || M == 0 || K == 0) {
      return result;
    }

    dim3 dimBlock(WARP_SIZE, WARP_SIZE);
    dim3 dimGrid((K + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);

    CUDA_CHECK(cudaMemPrefetchAsync(data(), size_bytes(), 0));
    CUDA_CHECK(cudaMemPrefetchAsync(other.data(), other.size_bytes(), 0));

    mat_mul_shared<<<dimGrid, dimBlock>>>(data(), other.data(), result.data(),
                                          N, M, K);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    return result;
  }

  template <size_t K>
  Mat<T, N, K> dot_shared_with_warp_intrinsics(
      const Mat<T, M, K> &other) const {
    Mat<T, N, K> result;
    if (N == 0 || M == 0 || K == 0) {
      return result;
    }

    dim3 dimBlock(WARP_SIZE, WARP_SIZE);
    dim3 dimGrid((K + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);

    CUDA_CHECK(cudaMemPrefetchAsync(data(), size_bytes(), 0));
    CUDA_CHECK(cudaMemPrefetchAsync(other.data(), other.size_bytes(), 0));

    mat_mul_shared_with_warp_intrinsics<<<dimGrid, dimBlock>>>(
        data(), other.data(), result.data(), N, M, K);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    return result;
  }

  Mat<T, M, N> transpose() const {
    Mat<T, M, N> result;
    if (N == 0 || M == 0) {
      return result;
    }

    dim3 dimBlock(WARP_SIZE, WARP_SIZE);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (M + dimBlock.y - 1) / dimBlock.y);

    CUDA_CHECK(cudaMemPrefetchAsync(data(), size_bytes(), 0));

    // Launch the free-standing kernel, not this member function.
    mat_transpose<<<dimGrid, dimBlock>>>(data(), result.data(), N, M);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    return result;
  }

  static Mat random() {
    Mat mat;
    for (size_t i = 0; i < N * M; ++i) {
      mat.data_[i] = Random::getInstance().next();
    }
    return mat;
  }

private:
  CudaPtr<T> data_;
};

} // namespace Mat
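
// Usage sketch (illustrative only, not part of this header): a minimal example
// of driving the class above, assuming CudaPtr allocates CUDA managed memory
// (so the host-side writes in Mat::random() and the cudaMemPrefetchAsync calls
// are valid) and that rand.h provides Random::getInstance().next(). The header
// name "mat.cuh" is hypothetical; the inner dimension K is deduced from the
// right-hand operand.
//
//   #include "mat.cuh"
//
//   int main() {
//     const auto a = Mat::Mat<float, 256, 512>::random();
//     const auto b = Mat::Mat<float, 512, 128>::random();
//
//     const auto c0 = a.dot_naive(b);                       // 256 x 128
//     const auto c1 = a.dot_shared(b);                      // tiled kernel
//     const auto c2 = a.dot_shared_with_warp_intrinsics(b); // shuffle variant
//     const auto at = a.transpose();                        // 512 x 256
//     return 0;
//   }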