Cudastf #794

Status: Draft. Wants to merge 41 commits into base: main.

Changes shown from 1 commit.

Commits (41):
d6dc01d  Update build config to pull CUDASTF (sidelnik, Nov 4, 2024)
245b20f  remove const expr (sidelnik, Nov 4, 2024)
9b35ec8  Updates to get basic cudastf functionality working with matx (sidelnik, Nov 5, 2024)
7d298d4  move to void_interface (sidelnik, Nov 5, 2024)
154b3f9  add stf executor (sidelnik, Nov 5, 2024)
c8ef988  support for cgsolve operator and a few examples (sidelnik, Nov 5, 2024)
52b18c9  make the sync() that is part of stfexecutor call ctx.task_fence() (sidelnik, Dec 3, 2024)
d726b10  fix typo (sidelnik, Dec 3, 2024)
5e7576c  Added test case (sidelnik, Dec 17, 2024)
1373699  Fixes to the sync (sidelnik, Dec 17, 2024)
92e7204  add support for cgsolve (sidelnik, Dec 17, 2024)
a608f3f  update to the simple radar code (sidelnik, Dec 17, 2024)
b062577  minor typo fix (sidelnik, Dec 17, 2024)
bbf9abc  update version of stf (sidelnik, Dec 19, 2024)
3e831ea  cleanup constexpr case for stfexecutor (sidelnik, Dec 19, 2024)
702fe79  cleanup constexpr case for stfexecutor (sidelnik, Dec 19, 2024)
5bfe21e  add conditional support for cudagraph to the stf executor (sidelnik, Dec 19, 2024)
f407256  update to latest cudastf (sidelnik, Jan 9, 2025)
221599c  switch to use logical token (sidelnik, Jan 9, 2025)
7a5bb6c  update parameters for radar code (sidelnik, Jan 9, 2025)
0c2432f  update to radar code to work with command line args (sidelnik, Jan 9, 2025)
3ae267b  cleanup to support different executor (sidelnik, Jan 9, 2025)
6a75794  cleanup radar code to emit stf and cuda versions (sidelnik, Jan 24, 2025)
f1facca  test script that runs simple radar with different input sizes. output… (sidelnik, Jan 24, 2025)
0199e75  enable cuda graphs as a command line argument enableGraphs (sidelnik, Jan 24, 2025)
39b16f4  add support for the random/randomOp generator (sidelnik, Jan 27, 2025)
9b7c4b0  get the basic spectrogram code working with stf (sidelnik, Jan 27, 2025)
f9e09f1  get spectrogram cudagraph code working with stf (sidelnik, Jan 27, 2025)
6c9a791  add assert in the case stream capture is turned on if creating a plan (sidelnik, Feb 10, 2025)
a1efd1c  Merge branch 'cudastf' into cudastf_latest (sidelnik, Mar 19, 2025)
6437eab  Merge pull request #2 from sidelnik/cudastf_latest (sidelnik, Mar 19, 2025)
bbb9aae  Apps using matx with stf should get these flags (caugonnet, Mar 24, 2025)
e13c9b6  fix constructor (caugonnet, Mar 24, 2025)
7244399  fix typo/bug (sidelnik, Apr 21, 2025)
66f6850  update to example code to fix compile error (sidelnik, Apr 21, 2025)
89e2a43  update to example code to fix compile error (sidelnik, Apr 21, 2025)
973886b  update test script for radar code (sidelnik, Apr 21, 2025)
92885e7  temp fix to the allocator dtor (sidelnik, Apr 21, 2025)
8607840  remove warning to work with latest stf (sidelnik, Apr 21, 2025)
14e0985  replace logical token with token (sidelnik, Apr 21, 2025)
92e04d5  update version to use cccl from main (sidelnik, Apr 21, 2025)
Commit under review: c8ef988d1e1987f320c4be256dae36621b7c02f2, "support for cgsolve operator and a few examples" (committed by sidelnik on Nov 5, 2024).
9 changes: 9 additions & 0 deletions examples/cgsolve.cu
@@ -54,7 +54,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
auto norm = make_tensor<TypeParam, 1>({BATCH});
auto maxn = make_tensor<TypeParam>({});

#if 0
cudaExecutor exec{};
#else
stfExecutor exec{};
auto ctx = exec.getCtx();
#endif

// Simple Poisson matrix
for(int b = 0; b < BATCH; b++) {
@@ -83,6 +88,10 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
(maxn = matx::max(sqrt(norm))).run(exec);

exec.sync();
#if 1
ctx.finalize();
Review thread on ctx.finalize() vs. exec.sync():

Collaborator: What is finalize used for vs. sync? Could you hide the context in the executor so the user doesn't need it, and have exec.sync() call finalize()?

Reply: finalize terminates everything in the STF context: it waits for asynchronous tasks, deletes internal resources, and so on, and you can only do it once. sync is closer to a ctx.task_fence(), which is a non-blocking fence (it returns a CUDA stream, and waiting on that stream means everything was done). I'd like to move finalize to the dtor of the executor, but there are some caveats if you define the executor as a static variable. Is that allowed? The caveat might be some inappropriate unload ordering of the CUDA and STF libraries, as usual.

Collaborator: Sounds good. I think the destructor is the right place, but does sync() work as expected?

Reply: @sidelnik is it doing a task fence with a stream sync?

Author: @caugonnet, sync() should be calling ctx.task_fence() now. I agree; I think we should place the ctx.finalize() inside the stf executor dtor.

#endif

// example-end sync-test-1
printf ("max l2 norm: %f\n", (float)sqrt(maxn()));

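A minimal sketch of the idea discussed in the thread above (ctx.finalize() in the executor destructor, exec.sync() built on ctx.task_fence()), assuming the cudax STF header path and the public stream_ctx API. The class name is illustrative; this sketches the proposal, not the code in this commit.

#include <cuda/experimental/stf.cuh>
#include <cuda_runtime.h>

class stfExecutorSketch {
public:
  stfExecutorSketch() = default;   // ctx_ is a default-constructed stream_ctx

  // sync(): issue a task fence and block on the stream it returns.
  // task_fence() itself is non-blocking; waiting on the returned stream
  // means every previously submitted STF task has completed.
  void sync() {
    cudaStream_t fence = ctx_.task_fence();
    cudaStreamSynchronize(fence);
  }

  // finalize() drains outstanding work and frees the context's internal
  // resources; it may only be called once, so the destructor is a natural
  // home. Static-lifetime executors would still need care because of the
  // CUDA/STF library unload ordering mentioned in the thread.
  ~stfExecutorSketch() { ctx_.finalize(); }

private:
  cuda::experimental::stf::stream_ctx ctx_;
};

int main() {
  stfExecutorSketch exec;
  exec.sync();   // no tasks submitted yet; the fence is still valid
  return 0;      // finalize() runs in the destructor
}
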
33 changes: 29 additions & 4 deletions examples/fft_conv.cu
@@ -73,7 +73,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
using complex = cuda::std::complex<float>;
#if 0
cudaExecutor exec{};
#else
stfExecutor exec{};
auto ctx = exec.getCtx();
#endif

index_t signal_size = 1ULL << 16;
index_t filter_size = 16;
@@ -117,7 +122,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// Perform the FFT in-place on both signal and filter
for (int i = 0; i < iterations; i++) {
if (i == 1) {
#if 0
cudaEventRecord(start, stream);
#else
cudaEventRecord(start, ctx.task_fence());
#endif
}
(sig_freq = fft(sig_time, filtered_size)).run(exec);
(filt_freq = fft(filt_time, filtered_size)).run(exec);
@@ -129,18 +138,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

}

#if 0
cudaEventRecord(stop, stream);
Review thread on event timing:

Collaborator: Eventually we should mask these events behind the executor as well, so the timing is the same regardless of the executor.

Reply: Yes, this makes it look like the code is very different for the two executors, but that timing is the sole reason, especially if finalize is moved to the dtor.

#else
cudaEventRecord(stop, ctx.task_fence());
#endif
exec.sync();
cudaEventElapsedTime(&separate_ms, start, stop);

for (int i = 0; i < iterations; i++) {
if (i == 1) {
cudaEventRecord(start, stream);
#if 0
cudaEventRecord(start, stream);
#else
cudaEventRecord(start, ctx.task_fence());
#endif
}
(sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec);
}


#if 0
cudaEventRecord(stop, stream);
#else
cudaEventRecord(stop, ctx.task_fence());
#endif
exec.sync();
cudaEventElapsedTime(&fused_ms, start, stop);

@@ -153,7 +174,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
(time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);

exec.sync();


#if 1
ctx.finalize();
#endif

// Compare signals
for (index_t b = 0; b < batches; b++) {
for (index_t i = 0; i < filtered_size; i++) {
@@ -172,4 +197,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
}
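
The timing thread above suggests hiding the CUDA events behind the executor so the example would not need #if/#else around cudaEventRecord. A hedged sketch of one way to do that follows; fence_stream() is a hypothetical accessor (not in MatX) that would return the executor's stream for the CUDA executor and ctx.task_fence() for the STF executor.

#include <cuda_runtime.h>

// Minimal stand-in executor: for the CUDA case the fence stream is simply the
// executor's stream; an STF version would return ctx.task_fence() instead.
struct StreamExecutorStub {
  cudaStream_t stream_{0};
  cudaStream_t fence_stream() const { return stream_; }   // hypothetical accessor
  void sync() const { cudaStreamSynchronize(stream_); }
};

// Records start/stop around a callable and returns elapsed milliseconds,
// identically for either executor type.
template <typename Executor, typename Body>
float time_body_ms(Executor &exec, Body &&body) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start, exec.fence_stream());
  body();                                      // enqueue the work to be timed
  cudaEventRecord(stop, exec.fence_stream());

  exec.sync();
  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;
}

int main() {
  StreamExecutorStub exec;
  float ms = time_body_ms(exec, [] { /* launch kernels here */ });
  (void)ms;
  return 0;
}

With a helper along these lines, the fused and non-fused timing loops in fft_conv.cu would read the same for cudaExecutor and stfExecutor.
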
3 changes: 3 additions & 0 deletions include/matx/executors/stf.h
@@ -63,10 +63,12 @@ template <typename T> constexpr bool is_matx_set_op();
stfExecutor(cudaStream_t stream) : stream_(stream) {
Review thread on the stream-based constructor:

Collaborator: What does a stream do here? I thought STF had its own internal streams?

Author: @cliffburdick In STF you can create nested/localized contexts and streams from existing (non-STF-created) streams. This allows the STF mechanisms to be correctly synchronized within the existing stream ecosystem. @caugonnet correct me if I am wrong.

cuda::experimental::stf::async_resources_handle handle;
ctx_ = cuda::experimental::stf::stream_ctx(stream, handle);
//ctx_ = cuda::experimental::stf::graph_ctx(stream, handle);
}
stfExecutor(int stream) : stream_(reinterpret_cast<cudaStream_t>(stream)) {
cuda::experimental::stf::async_resources_handle handle;
ctx_ = cuda::experimental::stf::stream_ctx(reinterpret_cast<cudaStream_t>(stream), handle);
//ctx_ = cuda::experimental::stf::graph_ctx(reinterpret_cast<cudaStream_t>(stream), handle);
}

/**
@@ -75,6 +77,7 @@ template <typename T> constexpr bool is_matx_set_op();
*/
stfExecutor() : stream_(0) {
ctx_ = cuda::experimental::stf::stream_ctx();
//ctx_ = cuda::experimental::stf::graph_ctx();
}

/**
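
For readers unfamiliar with the pattern described in the reply above, here is a hedged, standalone sketch of attaching an STF context to an existing user-created CUDA stream, mirroring the stream_ctx(stream, handle) call in the constructor, so that STF-submitted work stays ordered with respect to non-STF work on that stream.

#include <cuda/experimental/stf.cuh>
#include <cuda_runtime.h>

int main() {
  cudaStream_t user_stream;
  cudaStreamCreate(&user_stream);

  // ... non-STF kernels or copies may already be enqueued on user_stream ...

  // Build a context that is localized to the user's stream, as in the
  // stfExecutor(cudaStream_t) constructor above.
  cuda::experimental::stf::async_resources_handle handle;
  auto ctx = cuda::experimental::stf::stream_ctx(user_stream, handle);

  // STF tasks submitted through ctx are ordered after the prior stream work.
  // task_fence() returns a stream that can be used to rejoin the surrounding
  // stream ecosystem (event recording, further enqueues, etc.).
  cudaStream_t fence = ctx.task_fence();
  cudaStreamSynchronize(fence);

  ctx.finalize();
  cudaStreamDestroy(user_stream);
  return 0;
}
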
28 changes: 27 additions & 1 deletion include/matx/operators/all.h
@@ -72,9 +72,35 @@ namespace detail {
return tmp_out_(indices...);
};

template <typename Task>
__MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept {
/* Scenario where the all() operator is on the RHS and its reduction has already
run, so tmp_out_ gets read permission as it will only be read from. */
tmp_out_.apply_dep_to_task(std::forward<Task>(task), 1);
}

template <typename Out, typename Executor>
void Exec(Out &&out, Executor &&ex) const {
all_impl(cuda::std::get<0>(out), a_, ex);
auto output = cuda::std::get<0>(out);
// stfexecutor case
if constexpr (!is_cuda_executor_v<Executor>) {
auto ctx = ex.getCtx();
auto tsk = ctx.task();
tsk.set_symbol("all_task");

output.PreRun(out_dims_, std::forward<Executor>(ex));
output.apply_dep_to_task(tsk, 0);
Review comment (Collaborator): Why isn't apply_dep_to_task just part of PreRun? It looks like it's called in the same place.

a_.apply_dep_to_task(tsk, 1);

tsk->*[&](cudaStream_t s) {
auto exec = cudaExecutor(s);
all_impl(output, a_, exec);
};
}
// cudaExecutor case
else if constexpr (is_cuda_executor_v<Executor>) {
all_impl(output, a_, ex);
}
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
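
For readers unfamiliar with the STF side of the Exec body above, here is a standalone sketch of the same task pattern in plain CUDASTF (create a task, name it with set_symbol, declare data dependencies, then launch on the stream the task hands back). It is adapted from the public CUDASTF examples rather than from MatX, and declares the dependency at task creation instead of via apply_dep_to_task.

#include <cuda/experimental/stf.cuh>
using namespace cuda::experimental::stf;

__global__ void scale(slice<double> x, double a) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < static_cast<int>(x.size())) {
    x(i) *= a;
  }
}

int main() {
  context ctx;                       // stream-backed by default
  double X[16];
  for (int i = 0; i < 16; i++) {
    X[i] = static_cast<double>(i);
  }

  auto lx = ctx.logical_data(X);     // STF manages transfers for this data

  // Same shape as Exec above: build the task, label it, declare the
  // dependency (read/write here), then run the body on the task's stream.
  auto tsk = ctx.task(lx.rw());
  tsk.set_symbol("scale_task");
  tsk->*[&](cudaStream_t s, auto sx) {
    scale<<<1, 16, 0, s>>>(sx, 2.0);
  };

  ctx.finalize();                    // write back and release resources
  return 0;
}
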
8 changes: 8 additions & 0 deletions include/matx/operators/cast.h
@@ -83,6 +83,14 @@ namespace matx
return static_cast<NewType>(op_(indices...));
}

template <typename Task>
__MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept
{
if constexpr (is_matx_op<T>()) {
op_.apply_dep_to_task(std::forward<Task>(task), perm);
}
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun(ShapeType &&shape, Executor &&ex) const noexcept
{
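
The cast change above is one instance of a broader pattern in this commit: dependency declaration has to recurse through wrapper operators (cast/as_int, element-wise expressions) until it reaches the tensors that actually own storage. A simplified, self-contained model of that recursion is sketched below; the types are stand-ins for illustration, not MatX's real class hierarchy.

#include <utility>
#include <vector>

// Stand-in task that just records (storage, permission) pairs; the real code
// would register read/write dependencies on STF logical data instead.
struct TaskModel {
  std::vector<std::pair<const void *, int>> deps;
  void add(const void *storage, int perm) { deps.emplace_back(storage, perm); }
};

// Leaf: a tensor registers its own storage with the task.
struct TensorModel {
  float data_[8] = {};
  template <typename Task>
  void apply_dep_to_task(Task &&task, int perm) const { task.add(data_, perm); }
};

// Wrapper: a cast-like operator owns no storage and simply forwards,
// mirroring the cast.h change in this commit.
template <typename Op>
struct CastModel {
  Op op_;
  template <typename Task>
  void apply_dep_to_task(Task &&task, int perm) const {
    op_.apply_dep_to_task(std::forward<Task>(task), perm);
  }
};

int main() {
  TaskModel task;
  CastModel<TensorModel> casted{};     // e.g. as_int(tensor)
  casted.apply_dep_to_task(task, 1);   // read permission reaches the tensor
  return 0;
}
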
4 changes: 2 additions & 2 deletions include/matx/operators/cgsolve.h
@@ -90,8 +90,8 @@

template <typename Out, typename Executor>
void Exec(Out &&out, Executor &&ex) const{
static_assert(is_cuda_executor_v<Executor>, "cgsolve() only supports the CUDA executor currently");
cgsolve_impl(cuda::std::get<0>(out), a_, b_, tol_, max_iters_, ex.getStream());
//static_assert(is_cuda_executor_v<Executor>, "cgsolve() only supports the CUDA executor currently");
cgsolve_impl(cuda::std::get<0>(out), a_, b_, ex, tol_, max_iters_, ex.getStream());
}

template <typename ShapeType, typename Executor>
2 changes: 1 addition & 1 deletion include/matx/operators/fft.h
@@ -146,7 +146,7 @@ namespace matx
}

template <typename Task>
__MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept
__MATX_INLINE__ void apply_dep_to_task(Task &&task, [[maybe_unused]] int perm=1) const noexcept
{
/* Scenario where the matvec() operator is on the RHS and op has already
run previously. So we make tmp_out have a read permission as it will be read from */
28 changes: 27 additions & 1 deletion include/matx/operators/max.h
@@ -72,9 +72,35 @@ namespace detail {
return tmp_out_(indices...);
}

template <typename Task>
__MATX_INLINE__ void apply_dep_to_task(Task &&task, int perm=1) const noexcept {
/* Scenario where the max() operator is on the RHS and its reduction has already
run, so tmp_out_ gets read permission as it will only be read from. */
tmp_out_.apply_dep_to_task(std::forward<Task>(task), 1);
}

template <typename Out, typename Executor>
void Exec(Out &&out, Executor &&ex) const {
max_impl(cuda::std::get<0>(out), a_, ex);
auto output = cuda::std::get<0>(out);
// stfexecutor case
if constexpr (!is_cuda_executor_v<Executor>) {
auto ctx = ex.getCtx();
auto tsk = ctx.task();
tsk.set_symbol("max_task");

output.PreRun(out_dims_, std::forward<Executor>(ex));
output.apply_dep_to_task(tsk, 0);
a_.apply_dep_to_task(tsk, 1);

tsk->*[&](cudaStream_t s) {
auto exec = cudaExecutor(s);
max_impl(output, a_, exec);
};
}
// cudaExecutor case
else if constexpr (is_cuda_executor_v<Executor>) {
max_impl(output, a_, ex);
}
}

static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank()
34 changes: 22 additions & 12 deletions include/matx/transforms/cgsolve.h
@@ -58,8 +58,8 @@ namespace matx
* cuda Stream to execute on
*
*/
template <typename XType, typename AType, typename BType>
__MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, double tol=1e-6, int max_iters=4, cudaStream_t stream=0)
template <typename XType, typename AType, typename BType, typename Executor>
__MATX_INLINE__ void cgsolve_impl(XType X, AType A, BType B, Executor &&exec, double tol=1e-6, int max_iters=4, cudaStream_t stream=0)
{
using value_type = typename XType::value_type;
const int VRANK = XType::Rank();
@@ -120,15 +120,19 @@ namespace matx
auto pApc = clone<VRANK>(pAp, clone_shape);

// A*X
(Ap = matvec(A, X)).run(stream);
//(Ap = matvec(A, X)).run(stream);
(Ap = matvec(A, X)).run(exec);

Review comment: Is it the same to call run(exec) and run(stream) when we have a "classic" executor? (Won't it trigger much more work?)

// r0 = B - A*X
// p = r0
(p = r0 = B - Ap).run(stream);
//(p = r0 = B - Ap).run(stream);
(p = r0 = B - Ap).run(exec);

(r0r0 = sum(r0*r0)).run(stream);
//(r0r0 = sum(r0*r0)).run(stream);
(r0r0 = sum(r0*r0)).run(exec);

if(tol>0.0f) {
(converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream);
//(converged = matx::all(as_int(sqrt(r0r0) < tol))).run(stream);
(converged = matx::all(as_int(sqrt(r0r0) < tol))).run(exec);

cudaEventRecord(event, stream);
cudaStreamWaitEvent(d2h, event);
@@ -137,10 +141,12 @@
int i;
for (i = 0 ; i < max_iters; i++) {
// Ap = matvec(A, p)
(Ap = matvec(A, p)).run(stream);
//(Ap = matvec(A, p)).run(stream);
(Ap = matvec(A, p)).run(exec);

// pAp = dot(p,Ap)
(pAp = sum(p*Ap)).run(stream);
//(pAp = sum(p*Ap)).run(stream);
(pAp = sum(p*Ap)).run(exec);

// if pAp is zero then we have exactly numerically converged.
// However, this is batched so we may iterate more. Iterating
@@ -152,10 +158,12 @@
auto updateOp = ( r1 = r0 - (r0r0c/pApc) * Ap,
X = X + (r0r0c/pApc) * p);

(IF( pApc != value_type(0), updateOp)).run(stream);
//(IF( pApc != value_type(0), updateOp)).run(stream);
(IF( pApc != value_type(0), updateOp)).run(exec);

// r1r1 = dot(r1, r1)
(r1r1 = sum(r1*r1)).run(stream);
//(r1r1 = sum(r1*r1)).run(stream);
(r1r1 = sum(r1*r1)).run(exec);

if(tol>0.0f) {
// copy convergence criteria to host.
@@ -168,15 +176,17 @@
break;
}

(converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream);
//(converged = matx::all(as_int(sqrt(r1r1) < tol))).run(stream);
(converged = matx::all(as_int(sqrt(r1r1) < tol))).run(exec);

cudaEventRecord(event, stream);
cudaStreamWaitEvent(d2h, event);
}

// p = r1 + b * p
auto updateP = ( p = r1 + (r1r1c/r0r0c) * p);
(IF( pApc != value_type(0), updateP)).run(stream);
//(IF( pApc != value_type(0), updateP)).run(stream);
(IF( pApc != value_type(0), updateP)).run(exec);

// Advance residual
swap(r0r0, r1r1);
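
To close the loop on the review question above: if run(stream) simply wraps the stream in a cudaExecutor, as the MatX sources suggest, then run(exec) with a classic executor should enqueue the same work, and only the STF executor takes the extra task-creation path. A hedged usage sketch of the executor-aware cgsolve, condensed from the examples/cgsolve.cu changes in this commit (matrix setup elided, tensor shapes assumed), follows.

#include <matx.h>
using namespace matx;

int main() {
  MATX_ENTER_HANDLER();

  constexpr index_t BATCH = 1;
  constexpr index_t N = 16;
  auto A = make_tensor<float>({BATCH, N, N});
  auto B = make_tensor<float>({BATCH, N});
  auto X = make_tensor<float>({BATCH, N});
  // ... fill A with an SPD system (e.g. the Poisson stencil from the example)
  //     and B with a right-hand side; X holds the initial guess ...

  stfExecutor exec{};                        // STF-backed executor from this PR
  auto ctx = exec.getCtx();                  // not needed with cudaExecutor

  (X = cgsolve(A, B, 1e-6, 20)).run(exec);   // exec is forwarded to cgsolve_impl

  exec.sync();                               // ctx.task_fence() under the STF executor
  ctx.finalize();                            // STF only; still explicit in this commit

  MATX_EXIT_HANDLER();
}
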