-
Notifications
You must be signed in to change notification settings - Fork 97
Cudastf #794
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Cudastf #794
Changes from 6 commits
d6dc01d
245b20f
9b35ec8
7d298d4
154b3f9
c8ef988
52b18c9
d726b10
5e7576c
1373699
92e7204
a608f3f
b062577
bbf9abc
3e831ea
702fe79
5bfe21e
f407256
221599c
7a5bb6c
0c2432f
3ae267b
6a75794
f1facca
0199e75
39b16f4
9b7c4b0
f9e09f1
6c9a791
a1efd1c
6437eab
bbb9aae
e13c9b6
7244399
66f6850
89e2a43
973886b
92885e7
8607840
14e0985
92e04d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,9 @@ | ||
{ | ||
"packages": { | ||
"CCCL": { | ||
"version": "2.7.0-rc2", | ||
"git_shallow": true, | ||
"version": "2.8.0", | ||
"git_url": "https://github.com/NVIDIA/cccl.git", | ||
"git_tag": "10e915ac7b79a1ab3b9d7a795c621b47b122f513" | ||
"git_tag": "cb1fce5e1cb7362940bd7e74ab8fbf01942b6264" | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -73,7 +73,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
{ | ||
MATX_ENTER_HANDLER(); | ||
using complex = cuda::std::complex<float>; | ||
#if 0 | ||
cudaExecutor exec{}; | ||
#else | ||
stfExecutor exec{}; | ||
auto ctx = exec.getCtx(); | ||
#endif | ||
|
||
index_t signal_size = 1ULL << 16; | ||
index_t filter_size = 16; | ||
|
@@ -117,7 +122,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
// Perform the FFT in-place on both signal and filter | ||
for (int i = 0; i < iterations; i++) { | ||
if (i == 1) { | ||
#if 0 | ||
cudaEventRecord(start, stream); | ||
#else | ||
cudaEventRecord(start, ctx.task_fence()); | ||
#endif | ||
} | ||
(sig_freq = fft(sig_time, filtered_size)).run(exec); | ||
(filt_freq = fft(filt_time, filtered_size)).run(exec); | ||
|
@@ -129,18 +138,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
|
||
} | ||
|
||
#if 0 | ||
cudaEventRecord(stop, stream); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Eventually we should mask these events behind the executor as well so the timing is the same regardless of the executor. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes this makes it look like the code is very different for both executors but that timing is the sole reason especially if finalize is moved to the dtor |
||
#else | ||
cudaEventRecord(stop, ctx.task_fence()); | ||
#endif | ||
exec.sync(); | ||
cudaEventElapsedTime(&separate_ms, start, stop); | ||
|
||
for (int i = 0; i < iterations; i++) { | ||
if (i == 1) { | ||
cudaEventRecord(start, stream); | ||
#if 0 | ||
cudaEventRecord(start, stream); | ||
#else | ||
cudaEventRecord(start, ctx.task_fence()); | ||
#endif | ||
} | ||
(sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); | ||
} | ||
|
||
|
||
#if 0 | ||
cudaEventRecord(stop, stream); | ||
#else | ||
cudaEventRecord(stop, ctx.task_fence()); | ||
#endif | ||
exec.sync(); | ||
cudaEventElapsedTime(&fused_ms, start, stop); | ||
|
||
|
@@ -153,7 +174,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
(time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); | ||
|
||
exec.sync(); | ||
|
||
|
||
#if 1 | ||
ctx.finalize(); | ||
#endif | ||
|
||
// Compare signals | ||
for (index_t b = 0; b < batches; b++) { | ||
for (index_t i = 0; i < filtered_size; i++) { | ||
|
@@ -172,4 +197,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
|
||
CUDA_CHECK_LAST_ERROR(); | ||
MATX_EXIT_HANDLER(); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -91,6 +91,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
using stride_container = typename Desc::stride_container; | ||
using desc_type = Desc; ///< Descriptor type trait | ||
using self_type = tensor_t<T, RANK, Storage, Desc>; | ||
using stf_logicaldata_type = typename cuda::experimental::stf::logical_data<cuda::experimental::stf::void_interface>; | ||
|
||
/** | ||
* @brief Construct a new 0-D tensor t object | ||
|
@@ -107,7 +108,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
* @param rhs Object to copy from | ||
*/ | ||
__MATX_HOST__ tensor_t(tensor_t const &rhs) noexcept | ||
: detail::tensor_impl_t<T, RANK, Desc>{rhs.ldata_, rhs.desc_}, storage_(rhs.storage_) | ||
: detail::tensor_impl_t<T, RANK, Desc>{rhs.ldata_, rhs.desc_, rhs.stf_ldata_}, storage_(rhs.storage_) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be good to understand why this extra data member is needed, because this pointer exists on the device potentially many times, so it can increase the size of the operator. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's where a careful review of the design is needed ... Our logical data class tracks the use of a specific piece of data, your tensor seems to be a view to some data (with shapes and so on), so it's ok to have just the pointer and shapes, but in STF we do need to keep track of the internal state of the data (who owns a copy, which tasks depend on it, etc...). This is what the logical data does on your behalf and which your tensors cannot do by merely using the pointer. One conservative take is to say that if you slice a tensor, this is the SAME logical data, so that further concurrent write accesses are serialized. This is sub-optimal when you have non overlapping slices but we cannot do better in a simple strategy. This ensures correctness but not optimality for concurrency There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cliffburdick you say it exists many times on the device, but isn't this a host only class ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tensor_t is host/device, but tensor_impl_t is device-only There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then i'm even surprised a logical_data can exist in device code, or the storage for it ! 
But this may be a pointer to an optional logical data ... We need to improve that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ugh I mistyped. tensor_t is ONLY on the host. tensor_impl_t is both. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still, it's surprising that we allow the logical data pointer to go on a device There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ldata is local data, and is ultimately just a raw pointer that points to the data needed on the device. This may be the same as the base pointer, or it may be something like a strided/offset pointer. |
||
{ } | ||
|
||
/** | ||
|
@@ -116,7 +117,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
* @param rhs Object to move from | ||
*/ | ||
__MATX_HOST__ tensor_t(tensor_t &&rhs) noexcept | ||
: detail::tensor_impl_t<T, RANK, Desc>{rhs.ldata_, std::move(rhs.desc_)}, storage_(std::move(rhs.storage_)) | ||
: detail::tensor_impl_t<T, RANK, Desc>{rhs.ldata_, std::move(rhs.desc_), rhs.stf_ldata_}, storage_(std::move(rhs.storage_)) | ||
{ } | ||
|
||
|
||
|
@@ -134,6 +135,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
this->ldata_ = rhs.ldata_; | ||
storage_ = rhs.storage_; | ||
this->desc_ = rhs.desc_; | ||
this->stf_ldata_ = rhs.stf_ldata_; | ||
} | ||
|
||
/** Swaps two tensors | ||
|
@@ -152,6 +154,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
std::swap(lhs.ldata_, rhs.ldata_); | ||
swap(lhs.storage_, rhs.storage_); | ||
swap(lhs.desc_, rhs.desc_); | ||
std::swap(lhs.stf_ldata_, rhs.stf_ldata_); | ||
} | ||
|
||
__MATX_INLINE__ ~tensor_t() = default; | ||
|
@@ -177,6 +180,16 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
this->SetLocalData(storage_.data()); | ||
} | ||
|
||
template <typename S2 = Storage, typename D2 = Desc, | ||
std::enable_if_t<is_matx_storage_v<typename remove_cvref<S2>::type> && is_matx_descriptor_v<typename remove_cvref<D2>::type>, bool> = true> | ||
tensor_t(S2 &&s, D2 &&desc, T* ldata, std::optional<stf_logicaldata_type > *stf_ldata_) : | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to do something about that type ... std::optional<stf_logicaldata_type > *stf_ldata_ The rationale is to be able to define a tensor before it is associated to an executor, so the logical data might be set lazily. |
||
detail::tensor_impl_t<T, RANK, Desc>{std::forward<D2>(desc)}, | ||
storage_{std::forward<S2>(s)} | ||
{ | ||
this->stf_ldata_ = stf_ldata_; | ||
this->SetLocalData(storage_.data()); | ||
} | ||
|
||
/** | ||
* @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting | ||
* | ||
|
@@ -185,13 +198,28 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
* @param ldata | ||
*/ | ||
template <typename D2 = Desc> | ||
tensor_t(Storage s, D2 &&desc, T* ldata) : | ||
tensor_t(Storage s, D2 &&desc, T* ldata, std::optional<stf_logicaldata_type > *stf_ldata) : | ||
detail::tensor_impl_t<T, RANK, D2>{std::forward<D2>(desc)}, | ||
storage_{std::move(s)} | ||
{ | ||
this->stf_ldata_ = stf_ldata; | ||
this->SetLocalData(ldata); | ||
} | ||
|
||
/** | ||
* @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting | ||
* | ||
* @param s | ||
* @param desc | ||
* @param ldata | ||
*/ | ||
template <typename D2 = Desc> | ||
tensor_t(Storage s, D2 &&desc, T* ldata) : | ||
detail::tensor_impl_t<T, RANK, D2>{std::forward<D2>(desc)}, | ||
storage_{std::move(s)} | ||
{ | ||
this->SetLocalData(ldata); | ||
} | ||
|
||
/** | ||
* Constructor for a rank-1 and above tensor. | ||
|
@@ -646,7 +674,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
|
||
// Copy descriptor and call ctor with shape | ||
Desc new_desc{std::forward<Shape>(shape)}; | ||
return tensor_t<M, R, Storage, Desc>{storage_, std::move(new_desc), this->ldata_}; | ||
return tensor_t<M, R, Storage, Desc>{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -705,7 +733,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
"To get a reshaped view the tensor must be compact"); | ||
|
||
DefaultDescriptor<tshape.size()> desc{std::move(tshape)}; | ||
return tensor_t<T, NRANK, Storage, decltype(desc)>{storage_, std::move(desc), this->ldata_}; | ||
return tensor_t<T, NRANK, Storage, decltype(desc)>{storage_, std::move(desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -788,7 +816,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
|
||
// Copy descriptor and call ctor with shape | ||
Desc new_desc{this->desc_.Shape(), std::move(strides)}; | ||
return tensor_t<Type, RANK, Storage, Desc>{storage_, std::move(new_desc), data}; | ||
return tensor_t<Type, RANK, Storage, Desc>{storage_, std::move(new_desc), data, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -831,7 +859,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
} | ||
|
||
Desc new_desc{this->desc_.Shape(), std::move(strides)}; | ||
return tensor_t<Type, RANK, Storage, Desc>{storage_, std::move(new_desc), data}; | ||
return tensor_t<Type, RANK, Storage, Desc>{storage_, std::move(new_desc), data, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -854,7 +882,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) | ||
|
||
auto new_desc = this->PermuteImpl(dims); | ||
return tensor_t<T, RANK, Storage, Desc>{storage_, std::move(new_desc), this->ldata_}; | ||
return tensor_t<T, RANK, Storage, Desc>{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
|
||
|
@@ -1030,7 +1058,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
OverlapView(const cuda::std::array<typename Desc::shape_type, N> &windows, | ||
const cuda::std::array<typename Desc::stride_type, N> &strides) const { | ||
auto new_desc = this->template OverlapViewImpl<N>(windows, strides); | ||
return tensor_t<T, RANK + 1, Storage, decltype(new_desc)>{storage_, std::move(new_desc), this->ldata_}; | ||
return tensor_t<T, RANK + 1, Storage, decltype(new_desc)>{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -1064,7 +1092,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) | ||
|
||
auto new_desc = this->template CloneImpl<N>(clones); | ||
return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), this->ldata_}; | ||
return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
template <int N> | ||
|
@@ -1362,7 +1390,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
[[maybe_unused]] StrideType strides) const | ||
{ | ||
auto [new_desc, data] = this->template SliceImpl<N, StrideType>(firsts, ends, strides); | ||
return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), data}; | ||
return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), data, this->stf_ldata_}; | ||
} | ||
|
||
template <typename StrideType, int N = RANK> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is finalize used for vs. sync? Could you hide the context in the executor so the user doesn't need it, and have calling exec.sync() call finalize()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
finalize terminates everything in the stf context, it waits for asynchronous tasks, deletes internal resources etc... you can only do it once, sync is more equivalent to a ctx.task_fence() which is a non blocking fence (it returns a CUDA stream, and waiting for that stream means everything was done).
I'd like to move finalize to the dtor of the executor, but there are some caveats if you define the executor as a static variable, is this allowed ? The caveat might be some inappropriate unload ordering of CUDA and STF libraries as usual ...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good. I think the destructor is the right place. but does sync() work as expected?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sidelnik is it doing a task fence with a stream sync ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@caugonnet , sync() should be calling ctx.task_fence() now. I agree, I think we should place the ctx.finalize() inside the stf executor dtor