Skip to content

Cudastf #794

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 41 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
d6dc01d
Update build config to pull CUDASTF
sidelnik Nov 4, 2024
245b20f
remove const expr
sidelnik Nov 4, 2024
9b35ec8
Updates to get basic cudastf functionality working with matx
sidelnik Nov 5, 2024
7d298d4
move to void_interface
sidelnik Nov 5, 2024
154b3f9
add stf executor
sidelnik Nov 5, 2024
c8ef988
support for cgsolve operator and a few examples
sidelnik Nov 5, 2024
52b18c9
make the sync() that is part of stfexecutor call ctx.task_fence()
sidelnik Dec 3, 2024
d726b10
fix typo
sidelnik Dec 3, 2024
5e7576c
Added test case
sidelnik Dec 17, 2024
1373699
Fixes to the sync
sidelnik Dec 17, 2024
92e7204
add support for cgsolve
sidelnik Dec 17, 2024
a608f3f
update to the simple radar code
sidelnik Dec 17, 2024
b062577
minor typo fix
sidelnik Dec 17, 2024
bbf9abc
update version of stf
sidelnik Dec 19, 2024
3e831ea
cleanup constexpr case for stfexecutor
sidelnik Dec 19, 2024
702fe79
cleanup constexpr case for stfexecutor
sidelnik Dec 19, 2024
5bfe21e
add conditional support for cudagraph to the stf executor
sidelnik Dec 19, 2024
f407256
update to latest cudastf
sidelnik Jan 9, 2025
221599c
switch to use logical token
sidelnik Jan 9, 2025
7a5bb6c
update parameters for radar code
sidelnik Jan 9, 2025
0c2432f
update to radar code to work with command line args
sidelnik Jan 9, 2025
3ae267b
cleanup to support different executor
sidelnik Jan 9, 2025
6a75794
cleanup radar code to emit stf and cuda versions
sidelnik Jan 24, 2025
f1facca
test script that runs simple radar with different input sizes. output…
sidelnik Jan 24, 2025
0199e75
enable cuda graphs as a command line argument enableGraphs
sidelnik Jan 24, 2025
39b16f4
add support for the random/randomOp generator
sidelnik Jan 27, 2025
9b7c4b0
get the basic spectrogram code working with stf
sidelnik Jan 27, 2025
f9e09f1
get spectrogram cudagraph code working with stf
sidelnik Jan 27, 2025
6c9a791
add assert in the case stream capture is turned on if creating a plan
sidelnik Feb 10, 2025
a1efd1c
Merge branch 'cudastf' into cudastf_latest
sidelnik Mar 19, 2025
6437eab
Merge pull request #2 from sidelnik/cudastf_latest
sidelnik Mar 19, 2025
bbb9aae
Apps using matx with stf should get these flags
caugonnet Mar 24, 2025
e13c9b6
fix constructor
caugonnet Mar 24, 2025
7244399
fix typo/bug
sidelnik Apr 21, 2025
66f6850
update to example code to fix compile error
sidelnik Apr 21, 2025
89e2a43
update to example code to fix compile error
sidelnik Apr 21, 2025
973886b
update test script for radar code
sidelnik Apr 21, 2025
92885e7
temp fix to the allocator dtor
sidelnik Apr 21, 2025
8607840
remove warning to work with latest stf
sidelnik Apr 21, 2025
14e0985
replace logical token with token
sidelnik Apr 21, 2025
92e04d5
update version to use cccl from main
sidelnik Apr 21, 2025
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,18 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5)
message(FATAL_ERROR "MatX requires CUDA 11.5 or higher. Please update before using.")
endif()

set(CCCL_ENABLE_UNSTABLE ON)
message(STATUS "Finding CCCL...")
rapids_cpm_cccl(
BUILD_EXPORT_SET matx-exports
INSTALL_EXPORT_SET matx-exports
)

target_link_libraries(matx INTERFACE CCCL::CCCL)
target_link_libraries(matx INTERFACE CCCL::CCCL CCCL::cudax)
set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda)

# Set flags for compiling tests faster
set(MATX_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)
set(MATX_CUDA_FLAGS ${MATX_CUDA_FLAGS} --threads 0 -ftemplate-backtrace-limit=0)

# Hack because CMake doesn't have short-circuit evaluation
if (NOT CMAKE_BUILD_TYPE OR "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
Expand Down
5 changes: 2 additions & 3 deletions cmake/versions.json
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
{
"packages": {
"CCCL": {
"version": "2.7.0-rc2",
"git_shallow": true,
"version": "2.8.0",
"git_url": "https://github.com/NVIDIA/cccl.git",
"git_tag": "10e915ac7b79a1ab3b9d7a795c621b47b122f513"
"git_tag": "36e27f7c1074010eefaab64d387ff6663569e065"
}
}
}
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ set(examples
mvdr_beamformer
pwelch
resample_poly_bench
simple_stf_test
spectrogram
spectrogram_graph
spherical_harmonics
Expand Down
9 changes: 9 additions & 0 deletions examples/cgsolve.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
auto norm = make_tensor<TypeParam, 1>({BATCH});
auto maxn = make_tensor<TypeParam>({});

#if 0
cudaExecutor exec{};
#else
stfExecutor exec{};
#endif

// Simple Poisson matrix
for(int b = 0; b < BATCH; b++) {
Expand Down Expand Up @@ -83,6 +87,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
(maxn = matx::max(sqrt(norm))).run(exec);

exec.sync();
#if 1
auto ctx = exec.getCtx();
ctx.finalize();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is finalize used for vs sync? Could you hide the context in the executor so the user doesn't need it, and calling exec.sync() calls finalize()?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

finalize terminates everything in the STF context: it waits for asynchronous tasks, deletes internal resources, etc. You can only do it once. sync is more equivalent to a ctx.task_fence(), which is a non-blocking fence (it returns a CUDA stream, and waiting for that stream means everything was done).

I'd like to move finalize to the dtor of the executor, but there are some caveats if you define the executor as a static variable, is this allowed ? The caveat might be some inappropriate unload ordering of CUDA and STF libraries as usual ...

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. I think the destructor is the right place. but does sync() work as expected?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sidelnik is it doing a task fence with a stream sync ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@caugonnet, sync() should be calling ctx.task_fence() now. I agree — I think we should place the ctx.finalize() inside the stf executor dtor.

#endif

// example-end sync-test-1
printf ("max l2 norm: %f\n", (float)sqrt(maxn()));

Expand Down
33 changes: 29 additions & 4 deletions examples/fft_conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
{
MATX_ENTER_HANDLER();
using complex = cuda::std::complex<float>;
#if 0
cudaExecutor exec{};
#else
stfExecutor exec{};
auto ctx = exec.getCtx();
#endif

index_t signal_size = 1ULL << 16;
index_t filter_size = 16;
Expand Down Expand Up @@ -117,7 +122,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// Perform the FFT in-place on both signal and filter
for (int i = 0; i < iterations; i++) {
if (i == 1) {
#if 0
cudaEventRecord(start, stream);
#else
cudaEventRecord(start, ctx.task_fence());
#endif
}
(sig_freq = fft(sig_time, filtered_size)).run(exec);
(filt_freq = fft(filt_time, filtered_size)).run(exec);
Expand All @@ -129,18 +138,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

}

#if 0
cudaEventRecord(stop, stream);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eventually we should mask these events behind the executor as well so the timing is the same regardless of the executor.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes this makes it look like the code is very different for both executors but that timing is the sole reason especially if finalize is moved to the dtor

#else
cudaEventRecord(stop, ctx.task_fence());
#endif
exec.sync();
cudaEventElapsedTime(&separate_ms, start, stop);

for (int i = 0; i < iterations; i++) {
if (i == 1) {
cudaEventRecord(start, stream);
#if 0
cudaEventRecord(start, stream);
#else
cudaEventRecord(start, ctx.task_fence());
#endif
}
(sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec);
}


#if 0
cudaEventRecord(stop, stream);
#else
cudaEventRecord(stop, ctx.task_fence());
#endif
exec.sync();
cudaEventElapsedTime(&fused_ms, start, stop);

Expand All @@ -153,7 +174,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
(time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec);

exec.sync();


#if 1
ctx.finalize();
#endif

// Compare signals
for (index_t b = 0; b < batches; b++) {
for (index_t i = 0; i < filtered_size; i++) {
Expand All @@ -172,4 +197,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
}
83 changes: 65 additions & 18 deletions examples/simple_radar_pipeline.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,31 +39,60 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
index_t numPulses = 128;
index_t numSamples = 9000;
index_t waveformLength = 1000;
constexpr bool ENABLE_GRAPHS = false;
uint32_t iterations = 100;
constexpr int num_streams = 1;
cudaGraph_t graphs[num_streams];
cudaGraphExec_t instances[num_streams];
using complex = cuda::std::complex<float>;
RadarPipeline<complex> *pipelines[num_streams];

#if 0
constexpr int numStreams = 8;
#else
int numStreams = 1;
#endif

// Parse command-line arguments
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];

if (arg == "--numChannels" && i + 1 < argc) {
numChannels = std::stoi(argv[++i]);
} else if (arg == "--numPulses" && i + 1 < argc) {
numPulses = std::stoi(argv[++i]);
} else if (arg == "--numSamples" && i + 1 < argc) {
numSamples = std::stoi(argv[++i]);
} else if (arg == "--waveformLength" && i + 1 < argc) {
waveformLength = std::stoi(argv[++i]);
} else if (arg == "--iterations" && i + 1 < argc) {
iterations = std::stoi(argv[++i]);
} else if (arg == "--numStreams" && i + 1 < argc) {
numStreams = std::stoi(argv[++i]);
} else {
std::cerr << "Unknown option or missing value: " << arg << std::endl;
return 1; // Exit with error
}
}

std::cout << "Iterations: " << iterations << std::endl;
std::cout << "numChannels: " << numChannels << std::endl;
std::cout << "numPulses: " << numPulses << std::endl;
std::cout << "numNumSamples: " << numSamples << std::endl;
std::cout << "numSamples: " << numSamples << std::endl;
std::cout << "waveformLength: " << waveformLength << std::endl;
std::cout << "numStreams: " << numStreams << std::endl;

constexpr bool ENABLE_GRAPHS = false;
cudaGraph_t graphs[numStreams];
cudaGraphExec_t instances[numStreams];
using complex = cuda::std::complex<float>;
RadarPipeline<complex> *pipelines[numStreams];

// cuda stream to place work in
cudaStream_t streams[num_streams];
cudaStream_t streams[numStreams];

// manually set to log all NVTX levels
MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL );

// create some events for timing
cudaEvent_t starts[num_streams];
cudaEvent_t stops[num_streams];
cudaEvent_t starts[numStreams];
cudaEvent_t stops[numStreams];

for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
cudaEventCreate(&starts[s]);
cudaEventCreate(&stops[s]);
cudaStreamCreate(&streams[s]);
Expand Down Expand Up @@ -98,12 +127,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
};

// Warmup
for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
run_pipeline(s);
}

if (ENABLE_GRAPHS) {
for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal);
run_pipeline(s);
cudaStreamEndCapture(streams[s], &graphs[s]);
Expand All @@ -112,9 +141,14 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
}

for (uint32_t i = 0; i < iterations; i++) {
for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
if (i == 1) {
#ifdef USE_STF
auto ctx = pipelines[s]->exec.getCtx();
cudaEventRecord(starts[s], ctx.task_fence());
#else
cudaEventRecord(starts[s], streams[s]);
#endif
}

if (ENABLE_GRAPHS) {
Expand All @@ -126,24 +160,37 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
}
}

for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
#ifdef USE_STF
auto ctx = pipelines[s]->exec.getCtx();
cudaEventRecord(stops[s], ctx.task_fence());
#else
cudaEventRecord(stops[s], streams[s]);
#endif
pipelines[s]->sync();
}

#ifdef USE_STF
for (int s = 0; s < numStreams; s++) {
auto ctx = pipelines[s]->exec.getCtx();
ctx.finalize();
}
#endif

MATX_NVTX_END_RANGE(2)

MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3)
float time_ms;
cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]);
cudaEventElapsedTime(&time_ms, starts[numStreams-1], stops[numStreams-1]);
float time_s = time_ms * .001f;

auto mult = iterations * numChannels * numPulses * num_streams;
auto mult = iterations * numChannels * numPulses * numStreams;
printf("Pipeline finished in %.2fms, rate: %.2f pulses/channel/sec (%.2f Gbps)\n",
time_ms,
static_cast<float>(mult) / time_s,
static_cast<float>(mult*sizeof(complex)*numSamples*8)/time_s/1e9);

for (int s = 0; s < num_streams; s++) {
for (int s = 0; s < numStreams; s++) {
cudaEventDestroy(starts[s]);
cudaEventDestroy(stops[s]);
cudaStreamDestroy(streams[s]);
Expand Down
11 changes: 11 additions & 0 deletions examples/simple_radar_pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
#include <memory>
#include <stdint.h>

#ifndef USE_STF
#define USE_STF 1
#endif

using namespace matx;

/**
Expand Down Expand Up @@ -120,6 +124,7 @@ class RadarPipeline {
RadarPipeline() = delete;
~RadarPipeline()
{
std::cout << "DTOR for radar\n";

}

Expand All @@ -137,6 +142,7 @@ class RadarPipeline {
: numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen),
numChannels(_numChannels), stream(_stream), exec(_stream)
{
std::cout << "CTOR for pipeline\n";
numSamplesRnd = 1;
while (numSamplesRnd < numSamples) {
numSamplesRnd *= 2;
Expand Down Expand Up @@ -465,5 +471,10 @@ class RadarPipeline {
tensor_t<typename ComplexType::value_type, 2> cfarMaskView;

cudaStream_t stream;
#ifdef USE_STF
public:
stfExecutor exec;
#else
cudaExecutor exec;
#endif
};
Loading