Skip to content

Cudastf #794

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 41 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
d6dc01d
Update build config to pull CUDASTF
sidelnik Nov 4, 2024
245b20f
remove const expr
sidelnik Nov 4, 2024
9b35ec8
Updates to get basic cudastf functionality working with matx
sidelnik Nov 5, 2024
7d298d4
move to void_interface
sidelnik Nov 5, 2024
154b3f9
add stf executor
sidelnik Nov 5, 2024
c8ef988
support for cgsolve operator and a few examples
sidelnik Nov 5, 2024
52b18c9
make the sync() that is part of stfexecutor call ctx.task_fence()
sidelnik Dec 3, 2024
d726b10
fix typo
sidelnik Dec 3, 2024
5e7576c
Added test case
sidelnik Dec 17, 2024
1373699
Fixes to the sync
sidelnik Dec 17, 2024
92e7204
add support for cgsolve
sidelnik Dec 17, 2024
a608f3f
update to the simple radar code
sidelnik Dec 17, 2024
b062577
minor typo fix
sidelnik Dec 17, 2024
bbf9abc
update version of stf
sidelnik Dec 19, 2024
3e831ea
cleanup constexpr case for stfexecutor
sidelnik Dec 19, 2024
702fe79
cleanup constexpr case for stfexecutor
sidelnik Dec 19, 2024
5bfe21e
add conditional support for cudagraph to the stf executor
sidelnik Dec 19, 2024
f407256
update to latest cudastf
sidelnik Jan 9, 2025
221599c
switch to use logical token
sidelnik Jan 9, 2025
7a5bb6c
update parameters for radar code
sidelnik Jan 9, 2025
0c2432f
update to radar code to work with command line args
sidelnik Jan 9, 2025
3ae267b
cleanup to support different executor
sidelnik Jan 9, 2025
6a75794
cleanup radar code to emit stf and cuda versions
sidelnik Jan 24, 2025
f1facca
test script that runs simple radar with different input sizes. output…
sidelnik Jan 24, 2025
0199e75
enable cuda graphs as a command line argument enableGraphs
sidelnik Jan 24, 2025
39b16f4
add support for the random/randomOp generator
sidelnik Jan 27, 2025
9b7c4b0
get the basic spectrogram code working with stf
sidelnik Jan 27, 2025
f9e09f1
get spectrogram cudagraph code working with stf
sidelnik Jan 27, 2025
6c9a791
add assert in the case stream capture is turned on if creating a plan
sidelnik Feb 10, 2025
a1efd1c
Merge branch 'cudastf' into cudastf_latest
sidelnik Mar 19, 2025
6437eab
Merge pull request #2 from sidelnik/cudastf_latest
sidelnik Mar 19, 2025
bbb9aae
Apps using matx with stf should get these flags
caugonnet Mar 24, 2025
e13c9b6
fix constructor
caugonnet Mar 24, 2025
7244399
fix typo/bug
sidelnik Apr 21, 2025
66f6850
update to example code to fix compile error
sidelnik Apr 21, 2025
89e2a43
update to example code to fix compile error
sidelnik Apr 21, 2025
973886b
update test script for radar code
sidelnik Apr 21, 2025
92885e7
temp fix to the allocator dtor
sidelnik Apr 21, 2025
8607840
remove warning to work with latest stf
sidelnik Apr 21, 2025
14e0985
replace logical token with token
sidelnik Apr 21, 2025
92e04d5
update version to use cccl from main
sidelnik Apr 21, 2025
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,18 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5)
message(FATAL_ERROR "MatX requires CUDA 11.5 or higher. Please update before using.")
endif()

set(CCCL_ENABLE_UNSTABLE ON)
message(STATUS "Finding CCCL...")
rapids_cpm_cccl(
BUILD_EXPORT_SET matx-exports
INSTALL_EXPORT_SET matx-exports
)

target_link_libraries(matx INTERFACE CCCL::CCCL)
target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax)
set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda)

# Set flags for compiling tests faster
set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
set(MATX_CUDA_FLAGS ${MATX_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)

# Hack because CMake doesn't have short-circuit evaluation
if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
Expand Down
5 changes: 2 additions & 3 deletions cmake/versions.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
{
"packages": {
"CCCL": {
"version": "2.7.0-rc2",
"git_shallow": true,
"version": "2.8.0",
"git_url": "https://github.com/NVIDIA/cccl.git",
"git_tag": "10e915ac7b79a1ab3b9d7a795c621b47b122f513"
"git_tag": "36e27f7c1074010eefaab64d387ff6663569e065"
}
}
}
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ set(examples
mvdr_beamformer
pwelch
resample_poly_bench
simple_stf_test
spectrogram
spectrogram_graph
spherical_harmonics
Expand Down
9 changes: 9 additions & 0 deletions examples/cgsolve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
auto norm = make_tensor<TypeParam, 1>({BATCH});
auto maxn = make_tensor<TypeParam>({});

#if 0
cudaExecutor exec{};
#else
stfExecutor exec{};
#endif

// Simple Poisson matrix
for(int b = 0; b < BATCH; b++) {
Expand Down Expand Up @@ -83,6 +87,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
(maxn = matx::max(sqrt(norm))).run(exec);

exec.sync();
#if 1
auto ctx = exec.getCtx();
ctx.finalize();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is finalize used for vs sync? Could you hide the context in the executor so the user doesn't need it, and calling exec.sync() calls finalize()?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

finalize terminates everything in the STF context: it waits for asynchronous tasks, deletes internal resources, etc. You can only do it once. sync is more equivalent to a ctx.task_fence(), which is a non-blocking fence (it returns a CUDA stream, and waiting for that stream means everything was done).

I'd like to move finalize to the dtor of the executor, but there are some caveats if you define the executor as a static variable, is this allowed ? The caveat might be some inappropriate unload ordering of CUDA and STF libraries as usual ...

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. I think the destructor is the right place. but does sync() work as expected?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sidelnik is it doing a task fence with a stream sync ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@caugonnet, sync() should be calling ctx.task_fence() now. I agree — I think we should place the ctx.finalize() inside the stf executor dtor.

#endif

// example-end sync-test-1
printf ("max l2 norm: %f\n", (float)sqrt(maxn()));

Expand Down
33 changes: 29 additions & 4 deletions examples/fft_conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
using complex = cuda::std::complex<float>;
#if 0
cudaExecutor exec{};
#else
stfExecutor exec{};
auto ctx = exec.getCtx();
#endif

index_t signal_size = 1ULL << 16;
index_t filter_size = 16;
Expand Down Expand Up @@ -117,7 +122,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// Perform the FFT in-place on both signal and filter
for (int i = 0; i < iterations; i++) {
if (i == 1) {
#if 0
cudaEventRecord(start, stream);
#else
cudaEventRecord(start, ctx.task_fence());
#endif
}
(sig_freq = fft(sig_time, filtered_size)).run(exec);
(filt_freq = fft(filt_time, filtered_size)).run(exec);
Expand All @@ -129,18 +138,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

}

#if 0
cudaEventRecord(stop, stream);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eventually we should mask these events behind the executor as well so the timing is the same regardless of the executor.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes this makes it look like the code is very different for both executors but that timing is the sole reason especially if finalize is moved to the dtor

#else
cudaEventRecord(stop, ctx.task_fence());
#endif
exec.sync();
cudaEventElapsedTime(&separate_ms, start, stop);

for (int i = 0; i < iterations; i++) {
if (i == 1) {
cudaEventRecord(start, stream);
#if 0
cudaEventRecord(start, stream);
#else
cudaEventRecord(start, ctx.task_fence());
#endif
}
(sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec);
}


#if 0
cudaEventRecord(stop, stream);
#else
cudaEventRecord(stop, ctx.task_fence());
#endif
exec.sync();
cudaEventElapsedTime(&fused_ms, start, stop);

Expand All @@ -153,7 +174,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
(time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);

exec.sync();


#if 1
ctx.finalize();
#endif

// Compare signals
for (index_t b = 0; b < batches; b++) {
for (index_t i = 0; i < filtered_size; i++) {
Expand All @@ -172,4 +197,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
}
83 changes: 65 additions & 18 deletions examples/simple_radar_pipeline.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,31 +39,60 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
index_t numPulses = 128;
index_t numSamples = 9000;
index_t waveformLength = 1000;
constexpr bool ENABLE_GRAPHS = false;
uint32_t iterations = 100;
constexpr int num_streams = 1;
cudaGraph_t graphs[num_streams];
cudaGraphExec_t instances[num_streams];
using complex = cuda::std::complex<float>;
RadarPipeline<complex> *pipelines[num_streams];

#if 0
constexpr int numStreams = 8;
#else
int numStreams = 1;
#endif

// Parse command-line arguments
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];

if (arg == "--numChannels" && i + 1 < argc) {
numChannels = std::stoi(argv[++i]);
} else if (arg == "--numPulses" && i + 1 < argc) {
numPulses = std::stoi(argv[++i]);
} else if (arg == "--numSamples" && i + 1 < argc) {
numSamples = std::stoi(argv[++i]);
} else if (arg == "--waveformLength" && i + 1 < argc) {
waveformLength = std::stoi(argv[++i]);
} else if (arg == "--iterations" && i + 1 < argc) {
iterations = std::stoi(argv[++i]);
} else if (arg == "--numStreams" && i + 1 < argc) {
numStreams = std::stoi(argv[++i]);
} else {
std::cerr << "Unknown option or missing value: " << arg << std::endl;
return 1; // Exit with error
}
}

std::cout << "Iterations: " << iterations << std::endl;
std::cout << "numChannels: " << numChannels << std::endl;
std::cout << "numPulses: " << numPulses << std::endl;
std::cout << "numNumSamples: " << numSamples << std::endl;
std::cout << "numSamples: " << numSamples << std::endl;
std::cout << "waveformLength: " << waveformLength << std::endl;
std::cout << "numStreams: " << numStreams << std::endl;

constexpr bool ENABLE_GRAPHS = false;
cudaGraph_t graphs[numStreams];
cudaGraphExec_t instances[numStreams];
using complex = cuda::std::complex<float>;
RadarPipeline<complex> *pipelines[numStreams];

// cuda stream to place work in
cudaStream_t streams[num_streams];
cudaStream_t streams[numStreams];

// manually set to log all NVTX levels
MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL );

// create some events for timing
cudaEvent_t starts[num_streams];
cudaEvent_t stops[num_streams];
cudaEvent_t starts[numStreams];
cudaEvent_t stops[numStreams];

for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
cudaEventCreate(&starts[s]);
cudaEventCreate(&stops[s]);
cudaStreamCreate(&streams[s]);
Expand Down Expand Up @@ -98,12 +127,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
};

// Warmup
for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
run_pipeline(s);
}

if (ENABLE_GRAPHS) {
for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal);
run_pipeline(s);
cudaStreamEndCapture(streams[s], &graphs[s]);
Expand All @@ -112,9 +141,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
}

for (uint32_t i = 0; i < iterations; i++) {
for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
if (i == 1) {
#ifdef USE_STF
auto ctx = pipelines[s]->exec.getCtx();
cudaEventRecord(starts[s], ctx.task_fence());
#else
cudaEventRecord(starts[s], streams[s]);
#endif
}

if (ENABLE_GRAPHS) {
Expand All @@ -126,24 +160,37 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
}
}

for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
#ifdef USE_STF
auto ctx = pipelines[s]->exec.getCtx();
cudaEventRecord(stops[s], ctx.task_fence());
#else
cudaEventRecord(stops[s], streams[s]);
#endif
pipelines[s]->sync();
}

#ifdef USE_STF
for (int s = 0; s < numStreams; s++) {
auto ctx = pipelines[s]->exec.getCtx();
ctx.finalize();
}
#endif

MATX_NVTX_END_RANGE(2)

MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3)
float time_ms;
cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]);
cudaEventElapsedTime(&time_ms, starts[numStreams-1], stops[numStreams-1]);
float time_s = time_ms * .001f;

auto mult = iterations * numChannels * numPulses * num_streams;
auto mult = iterations * numChannels * numPulses * numStreams;
printf("Pipeline finished in %.2fms, rate: %.2f pulses/channel/sec (%.2f Gbps)\n",
time_ms,
static_cast<float>(mult) / time_s,
static_cast<float>(mult*sizeof(complex)*numSamples*8)/time_s/1e9);

for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
cudaEventDestroy(starts[s]);
cudaEventDestroy(stops[s]);
cudaStreamDestroy(streams[s]);
Expand Down
11 changes: 11 additions & 0 deletions examples/simple_radar_pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
#include <memory>
#include <stdint.h>

#ifndef USE_STF
#define USE_STF 1
#endif

using namespace matx;

/**
Expand Down Expand Up @@ -120,6 +124,7 @@ class RadarPipeline {
RadarPipeline() = delete;
~RadarPipeline()
{
std::cout << "DTOR for radar\n";

}

Expand All @@ -137,6 +142,7 @@ class RadarPipeline {
: numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen),
numChannels(_numChannels), stream(_stream), exec(_stream)
{
std::cout << "CTOR for pipeline\n";
numSamplesRnd = 1;
while (numSamplesRnd < numSamples) {
numSamplesRnd *= 2;
Expand Down Expand Up @@ -465,5 +471,10 @@ class RadarPipeline {
tensor_t<typename ComplexType::value_type, 2> cfarMaskView;

cudaStream_t stream;
#ifdef USE_STF
public:
stfExecutor exec;
#else
cudaExecutor exec;
#endif
};
Loading