Open3D (C++ API)  0.17.0
ParallelFor.h
Go to the documentation of this file.
1// ----------------------------------------------------------------------------
2// - Open3D: www.open3d.org -
3// ----------------------------------------------------------------------------
4// Copyright (c) 2018-2023 www.open3d.org
5// SPDX-License-Identifier: MIT
6// ----------------------------------------------------------------------------
7
8#pragma once
9
10#include <cstdint>
11#include <type_traits>
12
13#include "open3d/core/Device.h"
18
19#ifdef __CUDACC__
20#include <cuda.h>
21#include <cuda_runtime.h>
22
24#endif
25
26namespace open3d {
27namespace core {
28
29#ifdef __CUDACC__
30
// Launch configuration shared by the CUDA ParallelFor implementation:
// threads per block, and number of indices handled by each thread.
static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128;
static constexpr int64_t OPEN3D_PARFOR_THREAD = 4;

/// Calls f(idx) for the indices in [0, n) assigned to the calling thread.
///
/// A block owns a window of block_size * thread_size consecutive indices.
/// Within that window a thread starts at its threadIdx.x and advances by
/// block_size, visiting at most thread_size indices. Only the x-dimension
/// of the launch is used; the index arithmetic matches a launch with
/// block_size threads per block (which is how ParallelForCUDA_ launches it).
///
/// \param n Total number of indices; indices >= n are never passed to f.
/// \param f Callable invoked on the device as f(int64_t).
template <int64_t block_size, int64_t thread_size, typename func_t>
__global__ void ElementWiseKernel_(int64_t n, func_t f) {
    constexpr int64_t kItemsPerBlock = block_size * thread_size;
    int64_t workload_idx = blockIdx.x * kItemsPerBlock + threadIdx.x;
#pragma unroll
    for (int64_t step = 0; step < thread_size; ++step) {
        // Indices only grow from here, so once one is out of range the
        // remaining steps have nothing left to do.
        if (workload_idx >= n) {
            return;
        }
        f(workload_idx);
        workload_idx += block_size;
    }
}
47
/// Run a function in parallel on CUDA.
///
/// Launches ElementWiseKernel_ on the stream returned by
/// core::cuda::GetStream(), with OPEN3D_PARFOR_BLOCK threads per block and
/// enough blocks to cover [0, n) at OPEN3D_PARFOR_THREAD indices per thread.
/// Only the launch itself is checked for errors here; this function does not
/// synchronize with the stream.
///
/// \param device Must be a CUDA device, otherwise an error is logged.
/// \param n Number of indices; func is invoked with each index in [0, n).
/// Nothing is launched when n == 0.
/// \param func Callable passed by value to the kernel and invoked on the
/// device as func(int64_t).
template <typename func_t>
void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
    if (device.GetType() != Device::DeviceType::CUDA) {
        utility::LogError("ParallelFor for CUDA cannot run on device {}.",
                          device.ToString());
    }
    if (n == 0) {
        return;
    }

    // NOTE(review): presumably makes `device` the current CUDA device for the
    // lifetime of this object -- confirm against CUDAScopedDevice.
    CUDAScopedDevice scoped_device(device);
    int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
    // Ceiling division: the last block may be only partially filled.
    int64_t grid_size = (n + items_per_block - 1) / items_per_block;

    // The grid dimension of a launch is a 32-bit unsigned value, so a larger
    // 64-bit block count would be narrowed implicitly and could wrap to a
    // small grid that silently covers only part of [0, n). CUDA limits the
    // grid x-dimension to 2^31 - 1 blocks, so anything above that can never
    // launch correctly; report it the same way as a device mismatch instead.
    constexpr int64_t max_grid_size = INT32_MAX;
    if (grid_size > max_grid_size) {
        utility::LogError(
                "ParallelFor for CUDA cannot launch {} blocks for n = {}: "
                "the grid size is limited to {} blocks.",
                grid_size, n, max_grid_size);
    }

    ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
            <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
                    n, func);
    // Kernel launches do not return a status; fetch launch errors explicitly.
    OPEN3D_GET_LAST_CUDA_ERROR("ParallelFor failed.");
}
68
69#else
70
72template <typename func_t>
73void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
74 if (!device.IsCPU()) {
75 utility::LogError("ParallelFor for CPU cannot run on device {}.",
76 device.ToString());
77 }
78 if (n == 0) {
79 return;
80 }
81
82#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
83 for (int64_t i = 0; i < n; ++i) {
84 func(i);
85 }
86}
87
88#endif
89
102template <typename func_t>
103void ParallelFor(const Device& device, int64_t n, const func_t& func) {
104#ifdef __CUDACC__
105 ParallelForCUDA_(device, n, func);
106#else
107 ParallelForCPU_(device, n, func);
108#endif
109}
110
157template <typename vec_func_t, typename func_t>
158void ParallelFor(const Device& device,
159 int64_t n,
160 const func_t& func,
161 const vec_func_t& vec_func) {
162#ifdef BUILD_ISPC_MODULE
163
164#ifdef __CUDACC__
165 ParallelForCUDA_(device, n, func);
166#else
167 int num_threads = utility::EstimateMaxThreads();
168 ParallelForCPU_(device, num_threads, [&](int64_t i) {
169 int64_t start = n * i / num_threads;
170 int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
171 vec_func(start, end);
172 });
173#endif
174
175#else
176
177#ifdef __CUDACC__
178 ParallelForCUDA_(device, n, func);
179#else
180 ParallelForCPU_(device, n, func);
181#endif
182
183#endif
184}
185
#ifndef BUILD_ISPC_MODULE

// Internal helper macro.
// Without the ISPC module no kernel can be called: the range and the extra
// arguments are dropped and an error naming the requested kernel is logged.
#define OPEN3D_CALL_ISPC_KERNEL_(KernelName, first, last, ...) \
    utility::LogError( \
            "ISPC module disabled. Unable to call vectorized kernel {}", \
            OPEN3D_STRINGIFY(KernelName));

#else

// Internal helper macro.
// Calls KernelName(first, last, <extra arguments>) with the names of the
// ispc namespace made visible at the point of expansion. Expands to two
// statements, so it must be used inside a block.
#define OPEN3D_CALL_ISPC_KERNEL_(KernelName, first, last, ...) \
    using namespace ispc; \
    KernelName(first, last, __VA_ARGS__);

#endif
202
// Internal helper macro.
// Expands to a lambda whose first (unnamed) parameter has type DType and is
// used only for overload selection. The lambda forwards [start, end) and the
// extra arguments to the kernel named <KernelPrefix>_<DType>.
#define OPEN3D_OVERLOADED_LAMBDA_(DType, KernelPrefix, ...) \
    [&](DType, int64_t start, int64_t end) { \
        OPEN3D_CALL_ISPC_KERNEL_( \
                OPEN3D_CONCAT(KernelPrefix, OPEN3D_CONCAT(_, DType)), \
                start, end, __VA_ARGS__); \
    }
210
/// Wraps an ISPC kernel call in a lambda of the form
/// (int64_t start, int64_t end), i.e. the shape the vectorized ParallelFor
/// overload calls as vec_func(start, end).
///
/// The range bounds are passed to the kernel first, followed by the extra
/// macro arguments. Everything is captured by reference, so the captured
/// objects must outlive every call of the lambda.
#define OPEN3D_VECTORIZED(KernelName, ...) \
    [&](int64_t start, int64_t end) { \
        OPEN3D_CALL_ISPC_KERNEL_(KernelName, start, end, __VA_ARGS__); \
    }
224
/// Wraps a call to a type-specific ISPC kernel in a lambda of the form
/// (int64_t start, int64_t end).
///
/// DType must be an arithmetic type (checked with static_assert). A
/// value-initialized DType{} is passed to the callable built by
/// utility::Overload from one lambda per supported type, so the lambda whose
/// first parameter is exactly DType calls the kernel named
/// <KernelPrefix>_<DType>. Arithmetic types without a dedicated lambda reach
/// the generic fallback, which logs an error containing the typeid name and
/// the kernel prefix.
/// NOTE(review): assumes utility::Overload merges the given lambdas into a
/// single overload set -- confirm against its definition.
#define OPEN3D_TEMPLATE_VECTORIZED(DType, KernelPrefix, ...) \
    [&](int64_t start, int64_t end) { \
        static_assert(std::is_arithmetic<DType>::value, \
                      "Data type is not an arithmetic type"); \
        utility::Overload( \
                OPEN3D_OVERLOADED_LAMBDA_(bool, KernelPrefix, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(uint8_t, KernelPrefix, \
                                          __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int8_t, KernelPrefix, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(uint16_t, KernelPrefix, \
                                          __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int16_t, KernelPrefix, \
                                          __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(uint32_t, KernelPrefix, \
                                          __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int32_t, KernelPrefix, \
                                          __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(uint64_t, KernelPrefix, \
                                          __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(int64_t, KernelPrefix, \
                                          __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(float, KernelPrefix, __VA_ARGS__), \
                OPEN3D_OVERLOADED_LAMBDA_(double, KernelPrefix, __VA_ARGS__), \
                [&](auto&& unsupported_value, int64_t start, int64_t end) { \
                    utility::LogError( \
                            "Unsupported data type {} for calling " \
                            "vectorized kernel {}", \
                            typeid(unsupported_value).name(), \
                            OPEN3D_STRINGIFY(KernelPrefix)); \
                })(DType{}, start, end); \
    }
262
263} // namespace core
264} // namespace open3d
Common CUDA utilities.
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:48
#define LogError(...)
Definition: Logging.h:48
Definition: Device.h:18
bool IsCPU() const
Returns true iff device type is CPU.
Definition: Device.h:46
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition: Device.cpp:88
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:73
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:103
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:31
Definition: PinholeCameraIntrinsic.cpp:16