Skip to content

Tylera/gtc 2025 tutorials #900

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 101 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
101 commits
Select commit Hold shift + click to select a range
e590290
moving all 01 introduction tutorials to inlined examples
tylera-nvidia Jan 24, 2025
f7b6717
move notebook 2 to cell execution. run is currently broken
tylera-nvidia Jan 24, 2025
bbb37a9
updated transforms notebook to use cell based execution
tylera-nvidia Jan 24, 2025
6903ac4
adding script for notebook. has hardcoded paths
tylera-nvidia Jan 24, 2025
7825f95
adding empty notebooks for fusion/profiling
tylera-nvidia Jan 24, 2025
fb8e89a
Adding timing metrics to CUDA and host executors
cliffburdick Jan 27, 2025
c47e4b3
Fixed docs
cliffburdick Jan 29, 2025
82d5846
details added to notebook 1
tylera-nvidia Jan 29, 2025
c695d5a
adding incomplete books
tylera-nvidia Jan 29, 2025
9fa6c81
update profiling lab with content explaining API
tylera-nvidia Jan 30, 2025
9150c90
Merge branch 'timing' into tylera/gtc_2025_tutorials
tylera-nvidia Jan 30, 2025
e57459d
adding exec to all runs
tylera-nvidia Jan 30, 2025
5b19014
Merge remote-tracking branch 'origin/main' into tylera/gtc_2025_tutor…
tylera-nvidia Jan 30, 2025
e5d5a70
moving notebook to remove radar
tylera-nvidia Jan 30, 2025
585e5e6
cleanup
tylera-nvidia Jan 30, 2025
ba0651b
Initialize host cuRAND API when using host compiler
cliffburdick Feb 5, 2025
148c138
Merge remote-tracking branch 'origin/cliffburdick-patch-2' into tyler…
tylera-nvidia Feb 5, 2025
20c8104
Merge remote-tracking branch 'origin/main' into tylera/gtc_2025_tutor…
tylera-nvidia Feb 20, 2025
c6de2ba
updates for cleaning up and testing random fixes
tylera-nvidia Feb 21, 2025
2fd6368
Merge remote-tracking branch 'origin/main' into tylera/gtc_2025_tutor…
tylera-nvidia Feb 21, 2025
a52cf7c
cleanup for notebook1
tylera-nvidia Feb 21, 2025
006b8b5
removing checked in cell results
tylera-nvidia Feb 21, 2025
9b0a832
cleanup comments on notebook1
tylera-nvidia Feb 21, 2025
bf826bc
updates to better automate run script. fix for normcdf
tylera-nvidia Feb 24, 2025
794865c
removing custom function for now
tylera-nvidia Feb 24, 2025
c2a0534
updates for limitations section of fusion. mount entire scratch. and …
tylera-nvidia Feb 24, 2025
4fc9800
remove output for notebook 2
tylera-nvidia Feb 24, 2025
d3e204c
cleanup
tylera-nvidia Feb 24, 2025
99e7cdb
redefinition fix
tylera-nvidia Feb 24, 2025
a1428b1
Fix URL to creating tensors docs
dylan-eustice Feb 25, 2025
6577db9
Add section on tensor creation operators
dylan-eustice Feb 25, 2025
2134a8e
Updates to first lab
cliffburdick Feb 25, 2025
f5a2a7b
Allow run.sh to be run from anywhere
dylan-eustice Feb 28, 2025
c43550a
Move images into GTC folder
dylan-eustice Feb 28, 2025
0db2c89
Add samples for profiling
dylan-eustice Feb 28, 2025
1540335
Add kernel fusion example
dylan-eustice Feb 28, 2025
aa49036
Add lab 03 section on profiling
dylan-eustice Feb 28, 2025
404a288
Updates to notebook 1
cliffburdick Feb 28, 2025
a3299af
Merge branch 'tylera/gtc_2025_tutorials' of github.com:NVIDIA/MatX in…
tylera-nvidia Feb 28, 2025
125e7ec
Added missing image
cliffburdick Feb 28, 2025
e55299c
Merge branch 'tylera/gtc_2025_tutorials' of github.com:NVIDIA/MatX in…
tylera-nvidia Feb 28, 2025
df301e9
cleanup for 01
tylera-nvidia Feb 28, 2025
1e280ba
updates from dry run
tylera-nvidia Feb 28, 2025
930f302
moving all 01 introduction tutorials to inlined examples
tylera-nvidia Jan 24, 2025
e95cb1a
move notebook 2 to cell execution. run is currently broken
tylera-nvidia Jan 24, 2025
793ec9f
updated transforms notebook to use cell based execution
tylera-nvidia Jan 24, 2025
2c0f2ac
adding script for notebook. has hardcoded paths
tylera-nvidia Jan 24, 2025
f6fc957
adding empty notebooks for fusion/profiling
tylera-nvidia Jan 24, 2025
4dcdf2e
details added to notebook 1
tylera-nvidia Jan 29, 2025
e07ef74
adding incomplete books
tylera-nvidia Jan 29, 2025
71e8039
update profiling lab with content explaining API
tylera-nvidia Jan 30, 2025
eaff70e
adding exec to all runs
tylera-nvidia Jan 30, 2025
f9fd2f7
moving notebook to remove radar
tylera-nvidia Jan 30, 2025
9cac06b
cleanup
tylera-nvidia Jan 30, 2025
425589f
updates for cleaning up and testing random fixes
tylera-nvidia Feb 21, 2025
34767fb
cleanup for notebook1
tylera-nvidia Feb 21, 2025
226c5e5
removing checked in cell results
tylera-nvidia Feb 21, 2025
9c28a4d
cleanup comments on notebook1
tylera-nvidia Feb 21, 2025
9c73399
updates to better automate run script. fix for normcdf
tylera-nvidia Feb 24, 2025
f02dfcb
removing custom function for now
tylera-nvidia Feb 24, 2025
7d0380e
updates for limitations section of fusion. mount entire scratch. and …
tylera-nvidia Feb 24, 2025
4272ed6
remove output for notebook 2
tylera-nvidia Feb 24, 2025
09e8b78
cleanup
tylera-nvidia Feb 24, 2025
70af633
redefinition fix
tylera-nvidia Feb 24, 2025
da41579
Fix URL to creating tensors docs
dylan-eustice Feb 25, 2025
56ca87a
Add section on tensor creation operators
dylan-eustice Feb 25, 2025
f3e7f00
Updates to first lab
cliffburdick Feb 25, 2025
9cede20
Allow run.sh to be run from anywhere
dylan-eustice Feb 28, 2025
13fc97c
Move images into GTC folder
dylan-eustice Feb 28, 2025
b43ee46
Add samples for profiling
dylan-eustice Feb 28, 2025
e0cca4b
Add kernel fusion example
dylan-eustice Feb 28, 2025
5395200
Add lab 03 section on profiling
dylan-eustice Feb 28, 2025
60d858f
Updates to notebook 1
cliffburdick Feb 28, 2025
517d128
Added missing image
cliffburdick Feb 28, 2025
c157b65
cleanup for 01
tylera-nvidia Feb 28, 2025
a409d6a
updates from dry run
tylera-nvidia Feb 28, 2025
cca6426
Switched away from cling into a custom run command
cliffburdick Mar 3, 2025
a6c5811
Adding timing metrics to CUDA and host executors
cliffburdick Jan 27, 2025
b3cd319
Fixed docs
cliffburdick Jan 29, 2025
1e251fa
Updates to notebook 1
cliffburdick Feb 28, 2025
64cbd3b
Updated magic command and notebook 1
cliffburdick Mar 4, 2025
07e8c3b
Added more examples and bug fixes in magic script
cliffburdick Mar 4, 2025
83872ae
More updates
cliffburdick Mar 5, 2025
c2e7c6f
Update cuda.h
cliffburdick Mar 5, 2025
71d7af0
Minor fixes on scripts
cliffburdick Mar 5, 2025
f9ec657
accepting merges
tylera-nvidia Mar 7, 2025
e0af79d
Merge remote-tracking branch 'origin/main' into tylera/gtc_2025_tutor…
tylera-nvidia Mar 7, 2025
fd35b43
update to new magic command in notebook 2
tylera-nvidia Mar 7, 2025
4015210
clearing run cells
tylera-nvidia Mar 7, 2025
84921d7
Added solutions
cliffburdick Mar 7, 2025
219f2f9
Update 01_lab_intro.ipynb
cliffburdick Mar 7, 2025
43faa76
updates to move solutions
tylera-nvidia Mar 7, 2025
e57979a
Merge branch 'tylera/gtc_2025_tutorials' of github.com:NVIDIA/MatX in…
tylera-nvidia Mar 7, 2025
a8a4c4d
fixing markdown
tylera-nvidia Mar 7, 2025
e3f2262
Add cout line to indicate cell has compiled and run
dylan-eustice Mar 9, 2025
5a84d77
Updates to 03_lab_profiling.ipynb
dylan-eustice Mar 9, 2025
3da6773
Add backup Nsight reports
dylan-eustice Mar 9, 2025
6bd96d6
Add img files
dylan-eustice Mar 10, 2025
2ddcf9e
updates for language on fusion limitations
tylera-nvidia Mar 11, 2025
3d8d33d
Update 01_lab_intro.ipynb
cliffburdick Mar 17, 2025
c33a590
Update 01_C.ipynb
cliffburdick Mar 17, 2025
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions docs_input/basics/profiling.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
.. _profiling:

Profiling
#########

Profiling is a way to measure the performance of a program and to identify bottlenecks in your MatX application. Since
the method for profiling depends on the executor, each executor implements its own profiling mechanism. For example,
the CUDA executor can use events encapsulating the kernels it's profiling. The profiling is done through the executor
object rather than the `run` statement so that multiple `run`\s can be profiled together.

Profiling is done by calling the `start_timer()` method of the executor:

.. code-block:: cpp

exec.start_timer();

To stop the profiler, `stop_timer()` is called:

.. code-block:: cpp

exec.stop_timer();

Depending on the executor, `stop_timer()` may need to block for the operation to complete on an asynchronous executor.

Once `stop_timer()` returns, the execution time between the timers can be retrieved by calling `get_time_ms()`:

.. code-block:: cpp

auto time = exec.get_time_ms();

In the above example `time` contains the runtime of everything executed between the `start_timer()` and `stop_timer()` calls. For
a CUDA executor this is the time between the beginning of the first kernel and the end of the last. For a CPU executor this is the CPU
time between the two calls.

.. note::
Profiling does not currently work with a multi-threaded host executor.

For a full example of profiling, see the `spectrogram` example.
7 changes: 3 additions & 4 deletions docs_input/build.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ Optional features of MatX that require downloading separate libraries use additi
be explicit about their requirements.

The MatX CMake build configuration is intended to help download any libraries for both the required and optional features.
The CPM_ build system is used to help with package management and version control. By default, CPM will fetch other packages
The CPM build system is used to help with package management and version control. By default, CPM will fetch other packages
from the internet. Alternatively, the option ``CPM_USE_LOCAL_PACKAGES`` can be used to point to local downloads in an air-gapped
or offline environment. Choosing local versions of packages uses the typical ``find_package`` CMake search methods. Please see
the CPM_ documentation or the documentation for each package for more information.
the CPM documentation or the documentation for each package for more information.


System Requirements
Expand All @@ -27,8 +27,7 @@ for supported host compilers. Other requirements for optional components are lis
Required Third-party Dependencies
---------------------------------

- `CPM <https://github.com/cpm-cmake/CPM.cmake>`_ (* Included in the project source and does not require a separate download)
- `CCCL <https://github.com/NVIDIA/cccl>`_ 2.7.0+
- `CCCL <https://github.com/NVIDIA/cccl>`_ 2.7.0+ commit cbc6b9b or higher


Optional Third-party Dependencies
Expand Down
21 changes: 7 additions & 14 deletions examples/spectrogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,6 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

cudaStream_t stream;
cudaStreamCreate(&stream);

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaExecutor exec{stream};

float fs = 10000;
Expand Down Expand Up @@ -96,23 +91,23 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
(time = linspace<0>(num_samps, 0.0f, static_cast<float>(N) - 1.0f) / fs)
.run(exec);
// mod = 500 * np.cos(2*np.pi*0.25*time)
(modulation = 500 * cos(2 * M_PI * 0.25 * time)).run(exec);
(modulation = 500.f * cos(2.f * static_cast<typename complex::value_type>(M_PI) * 0.25f * time)).run(exec);
// carrier = amp * np.sin(2*np.pi*3e3*time + modulation)
(carrier = amp * sin(2 * M_PI * 3000 * time + modulation)).run(exec);
(carrier = amp * sin(2.f * static_cast<typename complex::value_type>(M_PI) * 3000.f * time + modulation)).run(exec);
// noise = 0.01 * fs / 2 * np.random.randn(time.shape)
(noise = sqrt(0.01 * fs / 2) * random<float>({N}, NORMAL)).run(exec);
(noise = sqrt(0.01f * fs / 2.f) * random<float>({N}, NORMAL)).run(exec);
// noise *= np.exp(-time/5)
(noise = noise * exp(-1.0f * time / 5.0f)).run(exec);
// x = carrier + noise
(x = carrier + noise).run(exec);

for (uint32_t i = 0; i < num_iterations; i++) {
if (i == 2) { // Start timer on third loop to allow generation of plot
cudaEventRecord(start, stream);
exec.start_timer();
}

// DFT Sample Frequencies (rfftfreq)
(freqs = (1.0 / (static_cast<float>(nfft) * 1 / fs)) *
(freqs = (1.0f / (static_cast<float>(nfft) * 1.f / fs)) *
linspace<0>(half_win, 0.0f, static_cast<float>(nfft) / 2.0f))
.run(exec);

Expand Down Expand Up @@ -143,15 +138,13 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

}

cudaEventRecord(stop, stream);
exec.stop_timer();
exec.sync();
cudaEventElapsedTime(&time_ms, start, stop);
time_ms = exec.get_time_ms();

printf("Spectrogram Time Without Graphs = %.2fus per iteration\n",
time_ms * 1e3 / num_iterations);

cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);

MATX_CUDA_CHECK_LAST_ERROR();
Expand Down
39 changes: 36 additions & 3 deletions include/matx/executors/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,24 @@ namespace matx
*
* @param stream CUDA stream
*/
cudaExecutor(cudaStream_t stream) : stream_(stream) {}
cudaExecutor(int stream) : stream_(reinterpret_cast<cudaStream_t>(stream)) {}
cudaExecutor(cudaStream_t stream) : stream_(stream) {
MATX_CUDA_CHECK(cudaEventCreate(&start_));
MATX_CUDA_CHECK(cudaEventCreate(&stop_));
}

cudaExecutor(int stream) : stream_(reinterpret_cast<cudaStream_t>(stream)) {
MATX_CUDA_CHECK(cudaEventCreate(&start_));
MATX_CUDA_CHECK(cudaEventCreate(&stop_));
}

/**
* @brief Construct a new cudaExecutor object using the default stream
*
*/
cudaExecutor() : stream_(0) {}
cudaExecutor() : stream_(0) {
MATX_CUDA_CHECK(cudaEventCreate(&start_));
MATX_CUDA_CHECK(cudaEventCreate(&stop_));
}

/**
* @brief Returns stream associated with executor
Expand All @@ -73,6 +83,27 @@ namespace matx
*
*/
void sync() { cudaStreamSynchronize(stream_); }

/**
 * @brief Begin a profiling interval by recording the start event on this
 *        executor's stream
 */
void start_timer()
{
  cudaEventRecord(start_, stream_);
}

/**
 * @brief End a profiling interval by recording the stop event on this
 *        executor's stream
 */
void stop_timer()
{
  cudaEventRecord(stop_, stream_);
}

/**
 * @brief Get the time in milliseconds between start_timer and stop_timer.
 *
 * Blocks the calling thread until the stop event has been synchronized, then
 * queries the elapsed time between the start and stop events.
 *
 * @return Elapsed time in milliseconds as written by cudaEventElapsedTime.
 *         The result starts at 0.0f, so that is what is returned if the query
 *         does not write a value (e.g. on failure).
 */
float get_time_ms() {
  // Zero-initialize so a failed query can never return an indeterminate value.
  // Also avoids naming the local `time`, which shadows ::time from <ctime>.
  float elapsed_ms = 0.0f;
  cudaEventSynchronize(stop_);
  cudaEventElapsedTime(&elapsed_ms, start_, stop_);
  return elapsed_ms;
}

/**
* Execute an operator on a device
Expand Down Expand Up @@ -139,6 +170,8 @@ namespace matx

private:
cudaStream_t stream_;
cudaEvent_t start_;
cudaEvent_t stop_;
};

};
27 changes: 27 additions & 0 deletions include/matx/executors/host.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,31 @@ class HostExecutor {
*/
void sync() {}

/**
 * @brief Mark the beginning of a profiled region by capturing the current
 *        high-resolution host clock time
 *
 * Only available when the executor is instantiated in single-threaded mode;
 * any other mode is rejected by the static assertion below.
 */
void start_timer() {
  MATX_STATIC_ASSERT_STR(MODE == ThreadsMode::SINGLE, matxNotSupported, "Timer not supported in multi-threaded mode");
  using clock_type = std::chrono::high_resolution_clock;
  start_ = clock_type::now();
}

/**
 * @brief Mark the end of a profiled region by capturing the current
 *        high-resolution host clock time
 *
 * Only available when the executor is instantiated in single-threaded mode;
 * any other mode is rejected by the static assertion below.
 */
void stop_timer() {
  MATX_STATIC_ASSERT_STR(MODE == ThreadsMode::SINGLE, matxNotSupported, "Timer not supported in multi-threaded mode");
  using clock_type = std::chrono::high_resolution_clock;
  stop_ = clock_type::now();
}

/**
 * @brief Get the time in milliseconds between start_timer and stop_timer.
 *
 * The value is derived from the two stored host timestamps: their difference
 * is truncated to whole microseconds and then scaled to milliseconds.
 *
 * @return Elapsed time in milliseconds
 */
float get_time_ms() {
  const auto elapsed = stop_ - start_;
  const auto micros = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
  return static_cast<float>(static_cast<double>(micros) / 1e3);
}

/**
* @brief Execute an operator
*
Expand Down Expand Up @@ -151,6 +176,8 @@ class HostExecutor {

private:
HostExecParams params_;
std::chrono::time_point<std::chrono::high_resolution_clock> start_;
std::chrono::time_point<std::chrono::high_resolution_clock> stop_;
};

using SingleThreadedHostExecutor = HostExecutor<ThreadsMode::SINGLE>;
Expand Down