-
Notifications
You must be signed in to change notification settings - Fork 97
Cudastf #794
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Cudastf #794
Changes from 6 commits
d6dc01d
245b20f
9b35ec8
7d298d4
154b3f9
c8ef988
52b18c9
d726b10
5e7576c
1373699
92e7204
a608f3f
b062577
bbf9abc
3e831ea
702fe79
5bfe21e
f407256
221599c
7a5bb6c
0c2432f
3ae267b
6a75794
f1facca
0199e75
39b16f4
9b7c4b0
f9e09f1
6c9a791
a1efd1c
6437eab
bbb9aae
e13c9b6
7244399
66f6850
89e2a43
973886b
92885e7
8607840
14e0985
92e04d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,9 @@ | ||
{ | ||
"packages": { | ||
"CCCL": { | ||
"version": "2.7.0-rc2", | ||
"git_shallow": true, | ||
"version": "2.8.0", | ||
"git_url": "https://github.com/NVIDIA/cccl.git", | ||
"git_tag": "10e915ac7b79a1ab3b9d7a795c621b47b122f513" | ||
"git_tag": "cb1fce5e1cb7362940bd7e74ab8fbf01942b6264" | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -73,7 +73,12 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
{ | ||
MATX_ENTER_HANDLER(); | ||
using complex = cuda::std::complex<float>; | ||
#if 0 | ||
cudaExecutor exec{}; | ||
#else | ||
stfExecutor exec{}; | ||
auto ctx = exec.getCtx(); | ||
#endif | ||
|
||
index_t signal_size = 1ULL << 16; | ||
index_t filter_size = 16; | ||
|
@@ -117,7 +122,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
// Perform the FFT in-place on both signal and filter | ||
for (int i = 0; i < iterations; i++) { | ||
if (i == 1) { | ||
#if 0 | ||
cudaEventRecord(start, stream); | ||
#else | ||
cudaEventRecord(start, ctx.task_fence()); | ||
#endif | ||
} | ||
(sig_freq = fft(sig_time, filtered_size)).run(exec); | ||
(filt_freq = fft(filt_time, filtered_size)).run(exec); | ||
|
@@ -129,18 +138,30 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
|
||
} | ||
|
||
#if 0 | ||
cudaEventRecord(stop, stream); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Eventually we should mask these events behind the executor as well so the timing is the same regardless of the executor. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes this makes it look like the code is very different for both executors but that timing is the sole reason especially if finalize is moved to the dtor |
||
#else | ||
cudaEventRecord(stop, ctx.task_fence()); | ||
#endif | ||
exec.sync(); | ||
cudaEventElapsedTime(&separate_ms, start, stop); | ||
|
||
for (int i = 0; i < iterations; i++) { | ||
if (i == 1) { | ||
cudaEventRecord(start, stream); | ||
#if 0 | ||
cudaEventRecord(start, stream); | ||
#else | ||
cudaEventRecord(start, ctx.task_fence()); | ||
#endif | ||
} | ||
(sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(exec); | ||
} | ||
|
||
|
||
#if 0 | ||
cudaEventRecord(stop, stream); | ||
#else | ||
cudaEventRecord(stop, ctx.task_fence()); | ||
#endif | ||
exec.sync(); | ||
cudaEventElapsedTime(&fused_ms, start, stop); | ||
|
||
|
@@ -153,7 +174,11 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
(time_out = conv1d(sig_time, filt1, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); | ||
|
||
exec.sync(); | ||
|
||
|
||
#if 1 | ||
ctx.finalize(); | ||
#endif | ||
|
||
// Compare signals | ||
for (index_t b = 0; b < batches; b++) { | ||
for (index_t i = 0; i < filtered_size; i++) { | ||
|
@@ -172,4 +197,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) | |
|
||
CUDA_CHECK_LAST_ERROR(); | ||
MATX_EXIT_HANDLER(); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -91,6 +91,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
using stride_container = typename Desc::stride_container; | ||
using desc_type = Desc; ///< Descriptor type trait | ||
using self_type = tensor_t<T, RANK, Storage, Desc>; | ||
using stf_logicaldata_type = typename cuda::experimental::stf::logical_data<cuda::experimental::stf::void_interface>; | ||
|
||
/** | ||
* @brief Construct a new 0-D tensor t object | ||
|
@@ -107,7 +108,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
* @param rhs Object to copy from | ||
*/ | ||
__MATX_HOST__ tensor_t(tensor_t const &rhs) noexcept | ||
: detail::tensor_impl_t<T, RANK, Desc>{rhs.ldata_, rhs.desc_}, storage_(rhs.storage_) | ||
: detail::tensor_impl_t<T, RANK, Desc>{rhs.ldata_, rhs.desc_, rhs.stf_ldata_}, storage_(rhs.storage_) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be good to understand why this extra data member is needed, because this pointer exists on the device potentially many times, so it can increase the size of the operator. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's where a careful review of the design is needed ... Our logical data class tracks the use of a specific piece of data, your tensor seems to be a view to some data (with shapes and so on), so it's ok to have just the pointer and shapes, but in STF we do need to keep track of the internal state of the data (who owns a copy, which tasks depend on it, etc...). This is what the logical data does on your behalf and which your tensors cannot do by merely using the pointer. One conservative take is to say that if you slice a tensor, this is the SAME logical data, so that further concurrent write accesses are serialized. This is sub-optimal when you have non overlapping slices but we cannot do better in a simple strategy. This ensures correctness but not optimality for concurrency There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cliffburdick you say it exists many times on the device, but isn't this a host only class ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tensor_t is host/device, but tensor_impl_t is device-only There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then i'm even surprised a logical_data can exist in device code, or the storage for it ! 
But this may be a pointer to an optional logical data ... We need to improve that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ugh I mistyped. tensor_t is ONLY on the host. tensor_impl_t is both. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still, it's surprising that we allow the logical data pointer to go on a device There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ldata is local data, and is ultimately just a raw pointer that points to the data needed on the device. This may be the same as the base pointer, or it may be something like a strided/offset pointer. |
||
{ } | ||
|
||
/** | ||
|
@@ -116,7 +117,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
* @param rhs Object to move from | ||
*/ | ||
__MATX_HOST__ tensor_t(tensor_t &&rhs) noexcept | ||
: detail::tensor_impl_t<T, RANK, Desc>{rhs.ldata_, std::move(rhs.desc_)}, storage_(std::move(rhs.storage_)) | ||
: detail::tensor_impl_t<T, RANK, Desc>{rhs.ldata_, std::move(rhs.desc_), rhs.stf_ldata_}, storage_(std::move(rhs.storage_)) | ||
{ } | ||
|
||
|
||
|
@@ -134,6 +135,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
this->ldata_ = rhs.ldata_; | ||
storage_ = rhs.storage_; | ||
this->desc_ = rhs.desc_; | ||
this->stf_ldata_ = rhs.stf_ldata_; | ||
} | ||
|
||
/** Swaps two tensors | ||
|
@@ -152,6 +154,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
std::swap(lhs.ldata_, rhs.ldata_); | ||
swap(lhs.storage_, rhs.storage_); | ||
swap(lhs.desc_, rhs.desc_); | ||
std::swap(lhs.stf_ldata_, rhs.stf_ldata_); | ||
} | ||
|
||
__MATX_INLINE__ ~tensor_t() = default; | ||
|
@@ -177,6 +180,16 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
this->SetLocalData(storage_.data()); | ||
} | ||
|
||
template <typename S2 = Storage, typename D2 = Desc, | ||
std::enable_if_t<is_matx_storage_v<typename remove_cvref<S2>::type> && is_matx_descriptor_v<typename remove_cvref<D2>::type>, bool> = true> | ||
tensor_t(S2 &&s, D2 &&desc, T* ldata, std::optional<stf_logicaldata_type > *stf_ldata_) : | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to do something about that type ... std::optional<stf_logicaldata_type > *stf_ldata_ The rationale is to be able to define a tensor before it is associated to an executor, so the logical data might be set lazily. |
||
detail::tensor_impl_t<T, RANK, Desc>{std::forward<D2>(desc)}, | ||
storage_{std::forward<S2>(s)} | ||
{ | ||
this->stf_ldata_ = stf_ldata_; | ||
this->SetLocalData(storage_.data()); | ||
} | ||
|
||
/** | ||
* @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting | ||
* | ||
|
@@ -185,13 +198,28 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
* @param ldata | ||
*/ | ||
template <typename D2 = Desc> | ||
tensor_t(Storage s, D2 &&desc, T* ldata) : | ||
tensor_t(Storage s, D2 &&desc, T* ldata, std::optional<stf_logicaldata_type > *stf_ldata) : | ||
detail::tensor_impl_t<T, RANK, D2>{std::forward<D2>(desc)}, | ||
storage_{std::move(s)} | ||
{ | ||
this->stf_ldata_ = stf_ldata; | ||
this->SetLocalData(ldata); | ||
} | ||
|
||
/** | ||
* @brief Construct a new tensor t object. Used to copy an existing storage object for proper reference counting | ||
* | ||
* @param s | ||
* @param desc | ||
* @param ldata | ||
*/ | ||
template <typename D2 = Desc> | ||
tensor_t(Storage s, D2 &&desc, T* ldata) : | ||
detail::tensor_impl_t<T, RANK, D2>{std::forward<D2>(desc)}, | ||
storage_{std::move(s)} | ||
{ | ||
this->SetLocalData(ldata); | ||
} | ||
|
||
/** | ||
* Constructor for a rank-1 and above tensor. | ||
|
@@ -646,7 +674,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
|
||
// Copy descriptor and call ctor with shape | ||
Desc new_desc{std::forward<Shape>(shape)}; | ||
return tensor_t<M, R, Storage, Desc>{storage_, std::move(new_desc), this->ldata_}; | ||
return tensor_t<M, R, Storage, Desc>{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -705,7 +733,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
"To get a reshaped view the tensor must be compact"); | ||
|
||
DefaultDescriptor<tshape.size()> desc{std::move(tshape)}; | ||
return tensor_t<T, NRANK, Storage, decltype(desc)>{storage_, std::move(desc), this->ldata_}; | ||
return tensor_t<T, NRANK, Storage, decltype(desc)>{storage_, std::move(desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -788,7 +816,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
|
||
// Copy descriptor and call ctor with shape | ||
Desc new_desc{this->desc_.Shape(), std::move(strides)}; | ||
return tensor_t<Type, RANK, Storage, Desc>{storage_, std::move(new_desc), data}; | ||
return tensor_t<Type, RANK, Storage, Desc>{storage_, std::move(new_desc), data, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -831,7 +859,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
} | ||
|
||
Desc new_desc{this->desc_.Shape(), std::move(strides)}; | ||
return tensor_t<Type, RANK, Storage, Desc>{storage_, std::move(new_desc), data}; | ||
return tensor_t<Type, RANK, Storage, Desc>{storage_, std::move(new_desc), data, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -854,7 +882,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) | ||
|
||
auto new_desc = this->PermuteImpl(dims); | ||
return tensor_t<T, RANK, Storage, Desc>{storage_, std::move(new_desc), this->ldata_}; | ||
return tensor_t<T, RANK, Storage, Desc>{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
|
||
|
@@ -1030,7 +1058,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
OverlapView(const cuda::std::array<typename Desc::shape_type, N> &windows, | ||
const cuda::std::array<typename Desc::stride_type, N> &strides) const { | ||
auto new_desc = this->template OverlapViewImpl<N>(windows, strides); | ||
return tensor_t<T, RANK + 1, Storage, decltype(new_desc)>{storage_, std::move(new_desc), this->ldata_}; | ||
return tensor_t<T, RANK + 1, Storage, decltype(new_desc)>{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
/** | ||
|
@@ -1064,7 +1092,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) | ||
|
||
auto new_desc = this->template CloneImpl<N>(clones); | ||
return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), this->ldata_}; | ||
return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), this->ldata_, this->stf_ldata_}; | ||
} | ||
|
||
template <int N> | ||
|
@@ -1362,7 +1390,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> { | |
[[maybe_unused]] StrideType strides) const | ||
{ | ||
auto [new_desc, data] = this->template SliceImpl<N, StrideType>(firsts, ends, strides); | ||
return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), data}; | ||
return tensor_t<T, N, Storage, decltype(new_desc)>{storage_, std::move(new_desc), data, this->stf_ldata_}; | ||
} | ||
|
||
template <typename StrideType, int N = RANK> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is finalize used for vs. sync? Could you hide the context in the executor so the user doesn't need it, and have calling exec.sync() call finalize()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
finalize terminates everything in the stf context, it waits for asynchronous tasks, deletes internal resources etc... you can only do it once, sync is more equivalent to a ctx.task_fence() which is a non blocking fence (it returns a CUDA stream, and waiting for that stream means everything was done).
I'd like to move finalize to the dtor of the executor, but there are some caveats if you define the executor as a static variable, is this allowed ? The caveat might be some inappropriate unload ordering of CUDA and STF libraries as usual ...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good. I think the destructor is the right place. but does sync() work as expected?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sidelnik is it doing a task fence with a stream sync ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@caugonnet , sync() should be calling ctx.task_fence() now. I agree, I think we should place the ctx.finalize() inside the stf executor dtor