Open3D (C++ API)  0.16.0
StdGPUHashBackend.h
// ----------------------------------------------------------------------------
// -                        Open3D: www.open3d.org                            -
// ----------------------------------------------------------------------------
// The MIT License (MIT)
//
// Copyright (c) 2018-2021 www.open3d.org
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// ----------------------------------------------------------------------------

#pragma once

#include <stdgpu/memory.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

#include <stdgpu/unordered_map.cuh>
#include <type_traits>

#include "open3d/core/CUDAUtils.h"
#include "open3d/core/StdAllocator.h"
#include "open3d/core/hashmap/CUDA/CUDAHashBackendBufferAccessor.h"
#include "open3d/core/hashmap/DeviceHashBackend.h"
#include "open3d/utility/Logging.h"

namespace open3d {
namespace core {

/// Class satisfying the Allocator requirements defined by the C++ standard.
/// Memory is obtained on a CUDA device through Open3D's StdAllocator (i.e. the
/// MemoryManager) and additionally registered with stdgpu, so that stdgpu
/// containers can allocate through Open3D.
template <typename T>
class StdGPUAllocator {
public:
    /// T.
    using value_type = T;

    /// Default constructor.
    StdGPUAllocator() = default;

    /// Constructor from device.
    explicit StdGPUAllocator(const Device& device) : std_allocator_(device) {}

    /// Default copy constructor.
    StdGPUAllocator(const StdGPUAllocator&) = default;

    /// Default copy assignment operator.
    StdGPUAllocator& operator=(const StdGPUAllocator&) = default;

    /// Default move constructor.
    StdGPUAllocator(StdGPUAllocator&&) = default;

    /// Default move assignment operator.
    StdGPUAllocator& operator=(StdGPUAllocator&&) = default;

    /// Rebind copy constructor.
    template <typename U>
    StdGPUAllocator(const StdGPUAllocator<U>& other)
        : std_allocator_(other.std_allocator_) {}

    /// Allocates memory of size n.
    T* allocate(std::size_t n) {
        if (!GetDevice().IsCUDA()) {
            utility::LogError("Unsupported device.");
        }

        T* p = std_allocator_.allocate(n);
        stdgpu::register_memory(p, n, stdgpu::dynamic_memory_type::device);
        return p;
    }

    /// Deallocates memory from pointer p of size n.
    void deallocate(T* p, std::size_t n) {
        if (!GetDevice().IsCUDA()) {
            utility::LogError("Unsupported device.");
        }

        stdgpu::deregister_memory(p, n, stdgpu::dynamic_memory_type::device);
        std_allocator_.deallocate(p, n);
    }

    /// Returns true if the instances are equal, false otherwise.
    bool operator==(const StdGPUAllocator& other) {
        return std_allocator_ == other.std_allocator_;
    }

    /// Returns true if the instances are not equal, false otherwise.
    bool operator!=(const StdGPUAllocator& other) { return !operator==(other); }

    /// Returns the device on which memory is allocated.
    Device GetDevice() const { return std_allocator_.GetDevice(); }

private:
    // Allow access in rebind constructor.
    template <typename T2>
    friend class StdGPUAllocator;

    StdAllocator<T> std_allocator_;
};
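
// Illustrative note (not part of the original header): StdGPUAllocator can be
// used like any standard allocator, with the difference that the memory is
// also registered with stdgpu. A minimal, hypothetical sketch:
//
//   StdGPUAllocator<int> alloc(Device("CUDA:0"));
//   int* buf = alloc.allocate(1024);  // MemoryManager alloc + stdgpu register
//   alloc.deallocate(buf, 1024);      // stdgpu deregister + MemoryManager free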
// These typedefs must be defined outside of StdGPUHashBackend to make them
// accessible in raw CUDA kernels.
template <typename Key>
using InternalStdGPUHashBackendAllocator =
        StdGPUAllocator<thrust::pair<const Key, buf_index_t>>;

template <typename Key, typename Hash, typename Eq>
using InternalStdGPUHashBackend =
        stdgpu::unordered_map<Key,
                              buf_index_t,
                              Hash,
                              Eq,
                              InternalStdGPUHashBackendAllocator<Key>>;
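
// Illustrative note (not part of the original header): since these aliases are
// declared at namespace scope, raw CUDA kernels can take the map by value, as
// the STDGPU*Kernel functions below do. A hypothetical kernel signature:
//
//   template <typename Key, typename Hash, typename Eq>
//   __global__ void MyKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
//                            const Key* keys, int64_t count);
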
template <typename Key, typename Hash, typename Eq>
class StdGPUHashBackend : public DeviceHashBackend {
public:
    StdGPUHashBackend(int64_t init_capacity,
                      int64_t key_dsize,
                      const std::vector<int64_t>& value_dsizes,
                      const Device& device);
    ~StdGPUHashBackend();

    void Reserve(int64_t capacity) override;

    /// Parallel insert contiguous arrays of keys and values.
    void Insert(const void* input_keys,
                const std::vector<const void*>& input_values_soa,
                buf_index_t* output_buf_indices,
                bool* output_masks,
                int64_t count) override;

    /// Parallel find a contiguous array of keys.
    void Find(const void* input_keys,
              buf_index_t* output_buf_indices,
              bool* output_masks,
              int64_t count) override;

    /// Parallel erase a contiguous array of keys.
    void Erase(const void* input_keys,
               bool* output_masks,
               int64_t count) override;

    /// Parallel collect all iterators in the hash table.
    int64_t GetActiveIndices(buf_index_t* output_indices) override;

    /// Clear stored map without reallocating memory.
    void Clear() override;

    /// Get the size (number of valid entries) of the hash map.
    int64_t Size() const override;

    /// Get the number of buckets of the hash map.
    int64_t GetBucketCount() const override;
    /// Get the number of entries per bucket.
    std::vector<int64_t> BucketSizes() const override;
    /// Get the current load factor, defined as size / bucket count.
    float LoadFactor() const override;

    InternalStdGPUHashBackend<Key, Hash, Eq> GetImpl() const { return impl_; }

    void Allocate(int64_t capacity);
    void Free();

protected:
    // Use reference, since the structure itself is implicitly handled as a
    // pointer directly by stdgpu.
    InternalStdGPUHashBackend<Key, Hash, Eq> impl_;

    CUDAHashBackendBufferAccessor buffer_accessor_;
};
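
// Illustrative note (not part of the original header): this backend is
// normally created through Open3D's generic hash map front end rather than
// instantiated directly. Assuming device-callable functors MyHash and MyEq
// (hypothetical names) and device pointers d_keys, d_values, d_buf_indices,
// d_masks, a direct use could look like:
//
//   StdGPUHashBackend<int, MyHash, MyEq> backend(
//           /*init_capacity=*/1000, /*key_dsize=*/sizeof(int),
//           /*value_dsizes=*/{sizeof(float)}, Device("CUDA:0"));
//   backend.Insert(d_keys, {d_values}, d_buf_indices, d_masks, /*count=*/n);
//   backend.Find(d_keys, d_buf_indices, d_masks, n);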

template <typename Key, typename Hash, typename Eq>
StdGPUHashBackend<Key, Hash, Eq>::StdGPUHashBackend(
        int64_t init_capacity,
        int64_t key_dsize,
        const std::vector<int64_t>& value_dsizes,
        const Device& device)
    : DeviceHashBackend(init_capacity, key_dsize, value_dsizes, device) {
    Allocate(init_capacity);
}

template <typename Key, typename Hash, typename Eq>
StdGPUHashBackend<Key, Hash, Eq>::~StdGPUHashBackend() {
    Free();
}

template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::Size() const {
    return impl_.size();
}

// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash, typename Eq>
__global__ void STDGPUFindKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
                                 CUDAHashBackendBufferAccessor buffer_accessor,
                                 const Key* input_keys,
                                 buf_index_t* output_buf_indices,
                                 bool* output_masks,
                                 int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    auto iter = map.find(key);
    bool flag = (iter != map.end());
    output_masks[tid] = flag;
    output_buf_indices[tid] = flag ? iter->second : 0;
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
                                            buf_index_t* output_buf_indices,
                                            bool* output_masks,
                                            int64_t count) {
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    STDGPUFindKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
            impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
            output_buf_indices, output_masks, count);
    cuda::Synchronize(this->device_);
}

// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash, typename Eq>
__global__ void STDGPUEraseKernel(InternalStdGPUHashBackend<Key, Hash, Eq> map,
                                  CUDAHashBackendBufferAccessor buffer_accessor,
                                  const Key* input_keys,
                                  buf_index_t* output_buf_indices,
                                  bool* output_masks,
                                  int64_t count) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    auto iter = map.find(key);
    bool flag = (iter != map.end());
    output_masks[tid] = flag;
    output_buf_indices[tid] = flag ? iter->second : 0;

    if (output_masks[tid]) {
        output_masks[tid] = map.erase(key);
        if (output_masks[tid]) {
            buffer_accessor.DeviceFree(output_buf_indices[tid]);
        }
    }
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Erase(const void* input_keys,
                                             bool* output_masks,
                                             int64_t count) {
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    core::Tensor toutput_buf_indices =
            core::Tensor({count}, core::Int32, this->device_);
    buf_index_t* output_buf_indices =
            static_cast<buf_index_t*>(toutput_buf_indices.GetDataPtr());

    STDGPUEraseKernel<<<blocks, threads, 0, core::cuda::GetStream()>>>(
            impl_, buffer_accessor_, static_cast<const Key*>(input_keys),
            output_buf_indices, output_masks, count);
    cuda::Synchronize(this->device_);
}

template <typename Key>
struct ValueExtractor {
    OPEN3D_HOST_DEVICE buf_index_t
    operator()(const thrust::pair<Key, buf_index_t>& x) const {
        return x.second;
    }
};

template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::GetActiveIndices(
        buf_index_t* output_indices) {
    auto range = impl_.device_range();

    thrust::transform(range.begin(), range.end(), output_indices,
                      ValueExtractor<Key>());

    return impl_.size();
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Clear() {
    impl_.clear();
    this->buffer_->ResetHeap();
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Reserve(int64_t capacity) {}

template <typename Key, typename Hash, typename Eq>
int64_t StdGPUHashBackend<Key, Hash, Eq>::GetBucketCount() const {
    return impl_.bucket_count();
}

template <typename Key, typename Hash, typename Eq>
std::vector<int64_t> StdGPUHashBackend<Key, Hash, Eq>::BucketSizes() const {
    utility::LogError("Unimplemented");
}

template <typename Key, typename Hash, typename Eq>
float StdGPUHashBackend<Key, Hash, Eq>::LoadFactor() const {
    return impl_.load_factor();
}

// Need an explicit kernel for non-const access to map
template <typename Key, typename Hash, typename Eq, typename block_t>
__global__ void STDGPUInsertKernel(
        InternalStdGPUHashBackend<Key, Hash, Eq> map,
        CUDAHashBackendBufferAccessor buffer_accessor,
        const Key* input_keys,
        const void* const* input_values_soa,
        buf_index_t* output_buf_indices,
        bool* output_masks,
        int64_t count,
        int64_t n_values) {
    uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= count) return;

    Key key = input_keys[tid];
    output_buf_indices[tid] = 0;
    output_masks[tid] = false;

    // First apply 'try insert' with a dummy index
    auto res = map.emplace(key, 0);

    // If success, change the iterator and provide the actual index
    if (res.second) {
        buf_index_t buf_index = buffer_accessor.DeviceAllocate();
        auto key_ptr = buffer_accessor.GetKeyPtr(buf_index);

        // Copy templated key to buffer (duplicate)
        // TODO: hack stdgpu inside and take out the buffer directly
        *static_cast<Key*>(key_ptr) = key;

        // Copy/reset non-templated value in buffer
        for (int j = 0; j < n_values; ++j) {
            const int64_t blocks_per_element =
                    buffer_accessor.value_blocks_per_element_[j];

            block_t* dst_value = static_cast<block_t*>(
                    buffer_accessor.GetValuePtr(buf_index, j));
            const block_t* src_value =
                    static_cast<const block_t*>(input_values_soa[j]) +
                    blocks_per_element * tid;
            for (int b = 0; b < blocks_per_element; ++b) {
                dst_value[b] = src_value[b];
            }
        }

        // Update from the dummy index
        res.first->second = buf_index;

        // Write to return variables
        output_buf_indices[tid] = buf_index;
        output_masks[tid] = true;
    }
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Insert(
        const void* input_keys,
        const std::vector<const void*>& input_values_soa,
        buf_index_t* output_buf_indices,
        bool* output_masks,
        int64_t count) {
    uint32_t threads = 128;
    uint32_t blocks = (count + threads - 1) / threads;

    thrust::device_vector<const void*> input_values_soa_device(
            input_values_soa.begin(), input_values_soa.end());

    int64_t n_values = input_values_soa.size();
    const void* const* ptr_input_values_soa =
            thrust::raw_pointer_cast(input_values_soa_device.data());

    DISPATCH_DIVISOR_SIZE_TO_BLOCK_T(
            buffer_accessor_.common_block_size_, [&]() {
                STDGPUInsertKernel<Key, Hash, Eq, block_t>
                        <<<blocks, threads, 0, core::cuda::GetStream()>>>(
                                impl_, buffer_accessor_,
                                static_cast<const Key*>(input_keys),
                                ptr_input_values_soa, output_buf_indices,
                                output_masks, count, n_values);
            });
    cuda::Synchronize(this->device_);
}
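
// Illustrative note (not part of the original header): the value payload is
// untyped, so STDGPUInsertKernel copies each value channel j as
// value_blocks_per_element_[j] chunks of sizeof(block_t) bytes.
// DISPATCH_DIVISOR_SIZE_TO_BLOCK_T dispatches on the common block size of all
// value channels to instantiate the kernel with a matching block_t. For
// example (hypothetical sizes), a 12-byte value channel processed with a
// 4-byte block_t is copied in 3 block assignments per element.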

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Allocate(int64_t capacity) {
    this->capacity_ = capacity;

    // Allocate buffer for key values.
    this->buffer_ = std::make_shared<HashBackendBuffer>(
            this->capacity_, this->key_dsize_, this->value_dsizes_,
            this->device_);
    buffer_accessor_.Setup(*this->buffer_);

    // stdgpu initializes on the default stream. Set the current stream to
    // ensure correct behavior.
    {
        CUDAScopedStream scoped_stream(cuda::GetDefaultStream());

        impl_ = InternalStdGPUHashBackend<Key, Hash, Eq>::createDeviceObject(
                this->capacity_,
                InternalStdGPUHashBackendAllocator<Key>(this->device_));
        cuda::Synchronize(this->device_);
    }
}

template <typename Key, typename Hash, typename Eq>
void StdGPUHashBackend<Key, Hash, Eq>::Free() {
    // Buffer is automatically handled by the smart pointer.
    buffer_accessor_.Shutdown(this->device_);

    // stdgpu initializes on the default stream. Set the current stream to
    // ensure correct behavior.
    {
        CUDAScopedStream scoped_stream(cuda::GetDefaultStream());

        InternalStdGPUHashBackend<Key, Hash, Eq>::destroyDeviceObject(impl_);
    }
}
}  // namespace core
}  // namespace open3d