Open3D (C++ API)  0.16.0
ParallelFor.h
Go to the documentation of this file.
1// ----------------------------------------------------------------------------
2// - Open3D: www.open3d.org -
3// ----------------------------------------------------------------------------
4// The MIT License (MIT)
5//
6// Copyright (c) 2018-2021 www.open3d.org
7//
8// Permission is hereby granted, free of charge, to any person obtaining a copy
9// of this software and associated documentation files (the "Software"), to deal
10// in the Software without restriction, including without limitation the rights
11// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12// copies of the Software, and to permit persons to whom the Software is
13// furnished to do so, subject to the following conditions:
14//
15// The above copyright notice and this permission notice shall be included in
16// all copies or substantial portions of the Software.
17//
18// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24// IN THE SOFTWARE.
25// ----------------------------------------------------------------------------
26
27#pragma once
28
29#include <cstdint>
30#include <type_traits>
31
32#include "open3d/core/Device.h"
37
38#ifdef __CUDACC__
39#include <cuda.h>
40#include <cuda_runtime.h>
41
43#endif
44
45namespace open3d {
46namespace core {
47
48#ifdef __CUDACC__
49
/// Number of threads per block used when launching ElementWiseKernel_.
static constexpr int64_t OPEN3D_PARFOR_BLOCK = 128;
/// Maximum number of workload indices handled by a single thread.
static constexpr int64_t OPEN3D_PARFOR_THREAD = 4;

/// Calls f(idx) for every workload index idx in [0, n) owned by this thread.
///
/// Each block covers block_size * thread_size consecutive indices. Within a
/// block, a thread visits at most thread_size indices that are block_size
/// apart, starting at its own threadIdx.x offset. The indexing assumes a 1D
/// launch with block_size threads per block.
///
/// \param n Total number of workloads; indices >= n are skipped.
/// \param f Callable invoked on the device with an int64_t workload index.
template <int64_t block_size, int64_t thread_size, typename func_t>
__global__ void ElementWiseKernel_(int64_t n, func_t f) {
    constexpr int64_t kItemsPerBlock = block_size * thread_size;
    // 64-bit arithmetic: blockIdx.x is promoted by the int64_t operand.
    const int64_t first_idx = blockIdx.x * kItemsPerBlock + threadIdx.x;
#pragma unroll
    for (int64_t step = 0; step < thread_size; ++step) {
        int64_t workload_idx = first_idx + step * block_size;
        // Indices grow with step, so once one is out of range all later
        // ones are too; the guard simply skips them.
        if (workload_idx < n) {
            f(workload_idx);
        }
    }
}
66
/// Run a function in parallel on CUDA.
///
/// Launches ElementWiseKernel_ on the stream returned by
/// core::cuda::GetStream(), with OPEN3D_PARFOR_BLOCK threads per block and
/// enough blocks to cover all n workloads.
///
/// \param device The CUDA device to run on. A non-CUDA device is reported
/// through utility::LogError.
/// \param n The number of workloads. Nothing is launched when n <= 0.
/// \param func The function to be executed for each workload index in
/// [0, n). It is passed by value to the kernel.
template <typename func_t>
void ParallelForCUDA_(const Device& device, int64_t n, const func_t& func) {
    if (device.GetType() != Device::DeviceType::CUDA) {
        utility::LogError("ParallelFor for CUDA cannot run on device {}.",
                          device.ToString());
    }
    // A non-positive workload count is a no-op. Without this guard a
    // negative n would yield a zero or negative grid size and the launch
    // would fail with an invalid-configuration error, whereas the CPU
    // implementation simply executes no iterations.
    if (n <= 0) {
        return;
    }

    CUDAScopedDevice scoped_device(device);
    int64_t items_per_block = OPEN3D_PARFOR_BLOCK * OPEN3D_PARFOR_THREAD;
    // Ceiling division so that a partially filled last block is included.
    // NOTE(review): grid_size is narrowed to the launch's unsigned grid
    // dimension; extremely large n could exceed the device limit - confirm
    // whether callers can reach that range.
    int64_t grid_size = (n + items_per_block - 1) / items_per_block;

    ElementWiseKernel_<OPEN3D_PARFOR_BLOCK, OPEN3D_PARFOR_THREAD>
            <<<grid_size, OPEN3D_PARFOR_BLOCK, 0, core::cuda::GetStream()>>>(
                    n, func);
    OPEN3D_GET_LAST_CUDA_ERROR("ParallelFor failed.");
}
87
88#else
89
91template <typename func_t>
92void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
93 if (!device.IsCPU()) {
94 utility::LogError("ParallelFor for CPU cannot run on device {}.",
95 device.ToString());
96 }
97 if (n == 0) {
98 return;
99 }
100
101#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
102 for (int64_t i = 0; i < n; ++i) {
103 func(i);
104 }
105}
106
107#endif
108
121template <typename func_t>
122void ParallelFor(const Device& device, int64_t n, const func_t& func) {
123#ifdef __CUDACC__
124 ParallelForCUDA_(device, n, func);
125#else
126 ParallelForCPU_(device, n, func);
127#endif
128}
129
176template <typename vec_func_t, typename func_t>
177void ParallelFor(const Device& device,
178 int64_t n,
179 const func_t& func,
180 const vec_func_t& vec_func) {
181#ifdef BUILD_ISPC_MODULE
182
183#ifdef __CUDACC__
184 ParallelForCUDA_(device, n, func);
185#else
186 int num_threads = utility::EstimateMaxThreads();
187 ParallelForCPU_(device, num_threads, [&](int64_t i) {
188 int64_t start = n * i / num_threads;
189 int64_t end = std::min<int64_t>(n * (i + 1) / num_threads, n);
190 vec_func(start, end);
191 });
192#endif
193
194#else
195
196#ifdef __CUDACC__
197 ParallelForCUDA_(device, n, func);
198#else
199 ParallelForCPU_(device, n, func);
200#endif
201
202#endif
203}
204
#if defined(BUILD_ISPC_MODULE)

// Internal helper macro.
// ISPC build: brings namespace ispc into scope and calls the kernel with the
// index range followed by the forwarded arguments. The expansion already
// ends with a semicolon.
#define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...) \
    using namespace ispc;                                     \
    ISPCKernel(start, end, __VA_ARGS__);

#else

// Internal helper macro.
// Non-ISPC build: the kernel is never called; the expansion reports the
// kernel name through utility::LogError instead. The range and the forwarded
// arguments are ignored.
#define OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, ...)            \
    utility::LogError(                                                   \
            "ISPC module disabled. Unable to call vectorized kernel {}", \
            OPEN3D_STRINGIFY(ISPCKernel));

#endif
221
// Internal helper macro.
// Expands to a lambda whose first (unnamed) parameter of type T is used only
// to select this overload. The lambda calls the kernel named by joining
// ISPCKernel, "_" and T through OPEN3D_CONCAT, passing the index range and
// the forwarded arguments.
#define OPEN3D_OVERLOADED_LAMBDA_(T, ISPCKernel, ...)                        \
    [&](T, int64_t start, int64_t end) {                                     \
        OPEN3D_CALL_ISPC_KERNEL_(                                            \
                OPEN3D_CONCAT(ISPCKernel, OPEN3D_CONCAT(_, T)), start, end,  \
                __VA_ARGS__);                                                \
    }
229
/// Defines a lambda taking an index range (int64_t start, int64_t end) that
/// calls the given ISPC kernel with that range followed by the remaining
/// macro arguments. Everything is captured by reference. Intended for
/// kernels whose name does not depend on a data type; the result can be
/// passed as the vec_func argument of the vectorized ParallelFor overload.
#define OPEN3D_VECTORIZED(ISPCKernel, ...)                              \
    [&](int64_t start, int64_t end) {                                   \
        OPEN3D_CALL_ISPC_KERNEL_(ISPCKernel, start, end, __VA_ARGS__);  \
    }
243
/// Defines a lambda taking an index range (int64_t start, int64_t end) that
/// dispatches to a type-specific ISPC kernel named <ISPCKernel>_<T>.
///
/// T must be an arithmetic type (enforced by static_assert). A
/// value-initialized T is passed to an overload set built with
/// utility::Overload, with one overload per supported type: bool,
/// (u)int8_t, (u)int16_t, (u)int32_t, (u)int64_t, float and double. Any
/// other T is handled by the generic fallback, which reports the type and
/// kernel name through utility::LogError.
#define OPEN3D_TEMPLATE_VECTORIZED(T, ISPCKernel, ...)                         \
    [&](int64_t start, int64_t end) {                                          \
        static_assert(std::is_arithmetic<T>::value,                            \
                      "Data type is not an arithmetic type");                  \
        utility::Overload(                                                     \
                OPEN3D_OVERLOADED_LAMBDA_(bool, ISPCKernel, __VA_ARGS__),      \
                OPEN3D_OVERLOADED_LAMBDA_(uint8_t, ISPCKernel, __VA_ARGS__),   \
                OPEN3D_OVERLOADED_LAMBDA_(int8_t, ISPCKernel, __VA_ARGS__),    \
                OPEN3D_OVERLOADED_LAMBDA_(uint16_t, ISPCKernel, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(int16_t, ISPCKernel, __VA_ARGS__),   \
                OPEN3D_OVERLOADED_LAMBDA_(uint32_t, ISPCKernel, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(int32_t, ISPCKernel, __VA_ARGS__),   \
                OPEN3D_OVERLOADED_LAMBDA_(uint64_t, ISPCKernel, __VA_ARGS__),  \
                OPEN3D_OVERLOADED_LAMBDA_(int64_t, ISPCKernel, __VA_ARGS__),   \
                OPEN3D_OVERLOADED_LAMBDA_(float, ISPCKernel, __VA_ARGS__),     \
                OPEN3D_OVERLOADED_LAMBDA_(double, ISPCKernel, __VA_ARGS__),    \
                [&](auto&& unsupported, int64_t, int64_t) {                    \
                    utility::LogError(                                         \
                            "Unsupported data type {} for calling "            \
                            "vectorized kernel {}",                            \
                            typeid(unsupported).name(),                        \
                            OPEN3D_STRINGIFY(ISPCKernel));                     \
                })(T{}, start, end);                                           \
    }
281
282} // namespace core
283} // namespace open3d
Common CUDA utilities.
#define OPEN3D_GET_LAST_CUDA_ERROR(message)
Definition: CUDAUtils.h:67
#define LogError(...)
Definition: Logging.h:67
Definition: Device.h:37
bool IsCPU() const
Returns true iff device type is CPU.
Definition: Device.h:65
std::string ToString() const
Returns string representation of device, e.g. "CPU:0", "CUDA:0".
Definition: Device.cpp:107
void ParallelForCPU_(const Device &device, int64_t n, const func_t &func)
Run a function in parallel on CPU.
Definition: ParallelFor.h:92
void ParallelFor(const Device &device, int64_t n, const func_t &func)
Definition: ParallelFor.h:122
int EstimateMaxThreads()
Estimate the maximum number of threads to be used in a parallel region.
Definition: Parallel.cpp:50
Definition: PinholeCameraIntrinsic.cpp:35