NVIDIA · cliffburdick · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/docs_input/basics/profiling.rst b/docs_input/basics/profiling.rst
@@ -0,0 +1,38 @@
+.. _profiling:
+
+Profiling
+#########
+
+Profiling is a way to measure the performance of a program and to identify bottlenecks in your MatX application. Since 
+the method for profiling depends on the executor, each executor implements its own profiling mechanism. For example, 
+the CUDA executor can use events encapsulating the kernels it's profiling. The profiling is done through the executor 
+object rather than the `run` statement so that multiple `run`\s can be profiled together.
+
+Profiling is done by calling the `start_timer()` method of the executor:
+
+.. code-block:: cpp
+
+    exec.start_timer();
+
+To stop the profiler, `stop_timer()` is called:
+
+.. code-block:: cpp
+
+    exec.stop_timer();
+
+Depending on the executor, `stop_timer()` may need to block for the operation to conplete on an asynchronous executor.
+
+Once `stop_timer()` returns, the execution time between the timers can be retrieved by calling `get_time_ms()`:
+
+.. code-block:: cpp
+
+    auto time = exec.get_time_ms();
+
+In the above example `time` contains the runtime of everything executed between the `start_timer()` and `stop_timer()` calls. For
+a CUDA executor this is the time between the beginning of the first kernel and the end of the last. For a CPU executor this is the CPU 
+time between the two calls.
+
+.. note::
+   Profiling does not work a multi-threaded host executor currently
+
+For a full example of profiling, see the `spectrogram` example.
diff --git a/docs_input/build.rst b/docs_input/build.rst
@@ -11,10 +11,10 @@ Optional features of MatX that require downloading separate libraries use additi
 be explicit about their requirements.
 
 The MatX CMake build configuration is intented to help download any libraries for both the required and optional features.
-The CPM_ build system is used to help with package management and version control. By default, CPM will fetch other packages
+The CPM build system is used to help with package management and version control. By default, CPM will fetch other packages
 from the internet. Alternatively, the option ``CPM_USE_LOCAL_PACKAGES`` can be used to point to local downloads in an air-gapped
 or offline environment. Choosing local versions of packages uses the typical ``find_packages`` CMake search methods. Please see 
-the CPM_ documentation or the documentation for each package for more information.
+the CPM documentation or the documentation for each package for more information.
 
 
 System Requirements
@@ -27,8 +27,7 @@ for supported host compilers. Other requirements for optional components are lis
 Required Third-party Dependencies
 ---------------------------------
 
-- `CPM <https://github.com/cpm-cmake/CPM.cmake>`_ (* Included in the project source and does not require a separate download)
-- `CCCL <https://github.com/NVIDIA/cccl>`_ 2.7.0+
+- `CCCL <https://github.com/NVIDIA/cccl>`_ 2.7.0+ commit cbc6b9b or higher
 
 
 Optional Third-party Dependencies

diff --git a/examples/spectrogram.cu b/examples/spectrogram.cu
@@ -60,11 +60,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
 
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-
   cudaExecutor exec{stream};
 
   float fs = 10000;
@@ -96,23 +91,23 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
   (time = linspace<0>(num_samps, 0.0f, static_cast<float>(N) - 1.0f) / fs)
       .run(exec);
   // mod = 500 * np.cos(2*np.pi*0.25*time)
-  (modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(exec);
+  (modulation = 500.f * cos(2.f * static_cast<typename complex::value_type>(M_PI) * 0.25f * time)).run(exec);
   // carrier = amp * np.sin(2*np.pi*3e3*time + modulation)
-  (carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(exec);
+  (carrier = amp * sin(2.f * static_cast<typename complex::value_type>(M_PI) * 3000.f * time + modulation)).run(exec);
   // noise = 0.01 * fs / 2 * np.random.randn(time.shape)
-  (noise = sqrt(0.01 * fs / 2) * random<float>({N}, NORMAL)).run(exec);
+  (noise = sqrt(0.01f * fs / 2.f) * random<float>({N}, NORMAL)).run(exec);
   // noise *= np.exp(-time/5)
   (noise = noise * exp(-1.0f * time / 5.0f)).run(exec);
   // x = carrier + noise
   (x = carrier + noise).run(exec);
 
   for (uint32_t i = 0; i < num_iterations; i++) {
     if (i == 2) { // Start timer on third loop to allow generation of plot
-      cudaEventRecord(start, stream);
+      exec.start_timer();
     }
 
     // DFT Sample Frequencies (rfftfreq)
-    (freqs = (1.0 / (static_cast<float>(nfft) * 1 / fs)) *
+    (freqs = (1.0f / (static_cast<float>(nfft) * 1.f / fs)) *
                linspace<0>(half_win, 0.0f, static_cast<float>(nfft) / 2.0f))
         .run(exec);
 
@@ -143,15 +138,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
 
   }
 
-  cudaEventRecord(stop, stream);
+  exec.stop_timer();
   exec.sync();
-  cudaEventElapsedTime(&time_ms, start, stop);
+  time_ms = exec.get_time_ms();
 
   printf("Spectrogram Time Without Graphs = %.2fus per iteration\n",
          time_ms * 1e3 / num_iterations);
 
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
   cudaStreamDestroy(stream);
 
   MATX_CUDA_CHECK_LAST_ERROR();

diff --git a/include/matx/executors/cuda.h b/include/matx/executors/cuda.h
@@ -54,14 +54,24 @@ namespace matx
        * 
        * @param stream CUDA stream
        */
-      cudaExecutor(cudaStream_t stream) : stream_(stream) {}
-      cudaExecutor(int stream) : stream_(reinterpret_cast<cudaStream_t>(stream)) {}
+      cudaExecutor(cudaStream_t stream) : stream_(stream) {
+        MATX_CUDA_CHECK(cudaEventCreate(&start_));
+        MATX_CUDA_CHECK(cudaEventCreate(&stop_));
+      }
+
+      cudaExecutor(int stream) : stream_(reinterpret_cast<cudaStream_t>(stream)) {
+        MATX_CUDA_CHECK(cudaEventCreate(&start_));
+        MATX_CUDA_CHECK(cudaEventCreate(&stop_));
+      }
 
       /**
        * @brief Construct a new cudaExecutor object using the default stream
        * 
        */
-      cudaExecutor() : stream_(0) {}
+      cudaExecutor() : stream_(0) {
+        MATX_CUDA_CHECK(cudaEventCreate(&start_));
+        MATX_CUDA_CHECK(cudaEventCreate(&stop_));
+      }
 
       /**
        * @brief Returns stream associated with executor
@@ -73,6 +83,27 @@ namespace matx
        * 
        */
       void sync() { cudaStreamSynchronize(stream_); }
+
+      /**
+       * @brief Start a timer for profiling workload
+       */
+      void start_timer() { cudaEventRecord(start_, stream_); }
+
+      /**
+       * @brief Stop a timer for profiling workload
+       */      
+      void stop_timer() { cudaEventRecord(stop_, stream_); }
+
+      /**
+       * @brief Get the time in milliseconds between start_timer and stop_timer. 
+       * This will block until the event is synchronized
+       */
+      float get_time_ms() {
+        float time;
+        cudaEventSynchronize(stop_);
+        cudaEventElapsedTime(&time, start_, stop_);
+        return time;
+      }
 
       /**
        * Execute an operator on a device
@@ -139,6 +170,8 @@ namespace matx
 
     private:
       cudaStream_t stream_;
+      cudaEvent_t start_;
+      cudaEvent_t stop_;
   };
 
 };
diff --git a/include/matx/executors/host.h b/include/matx/executors/host.h
@@ -112,6 +112,31 @@ class HostExecutor {
      */
     void sync() {}
 
+    /**
+     * @brief Start a timer for profiling workload
+     */
+    void start_timer() { 
+      MATX_STATIC_ASSERT_STR(MODE == ThreadsMode::SINGLE, matxNotSupported, "Timer not supported in multi-threaded mode");
+      start_ = std::chrono::high_resolution_clock::now();
+     }
+
+    /**
+     * @brief Stop a timer for profiling workload
+     */      
+    void stop_timer() { 
+      MATX_STATIC_ASSERT_STR(MODE == ThreadsMode::SINGLE, matxNotSupported, "Timer not supported in multi-threaded mode");
+      stop_ = std::chrono::high_resolution_clock::now();
+    }
+
+    /**
+     * @brief Get the time in milliseconds between start_timer and stop_timer. 
+     * This will block until the event is synchronized
+     */
+    float get_time_ms() {
+      auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop_ - start_);
+      return static_cast<float>(static_cast<double>(duration.count()) / 1e3);
+    }    
+
     /**
      * @brief Execute an operator
      *
@@ -151,6 +176,8 @@ class HostExecutor {
 
     private:
       HostExecParams params_;
+      std::chrono::time_point<std::chrono::high_resolution_clock> start_;
+      std::chrono::time_point<std::chrono::high_resolution_clock> stop_;
 };
 
 using SingleThreadedHostExecutor = HostExecutor<ThreadsMode::SINGLE>;