30#ifdef DEAL_II_WITH_TBB
31# include <tbb/blocked_range.h>
32# include <tbb/parallel_for.h>
34# ifndef DEAL_II_TBB_WITH_ONEAPI
35# include <tbb/task_scheduler_init.h>
65 namespace MatrixFreeFunctions
67#if defined(DEAL_II_WITH_TBB) && !defined(DEAL_II_TBB_WITH_ONEAPI)
78 const unsigned int partition,
79 const TaskInfo & task_info)
83 , task_info(task_info)
87 const unsigned int partition,
88 const TaskInfo & task_info)
92 , task_info(task_info)
103 if (task_info.face_partition_data.empty() ==
false)
111 MFWorkerInterface * worker;
114 const TaskInfo & task_info;
121 const unsigned int partition,
122 const TaskInfo & task_info,
135 tbb::empty_task::spawn(*
dummy);
167 const unsigned int evens = task_info.partition_evens[
partition];
168 const unsigned int odds = task_info.partition_odds[
partition];
169 const unsigned int n_blocked_workers =
170 task_info.partition_n_blocked_workers[
partition];
171 const unsigned int n_workers =
172 task_info.partition_n_workers[
partition];
173 std::vector<CellWork *> worker(n_workers);
176 root->set_ref_count(evens + 1);
177 for (
unsigned int j = 0;
j < evens; ++
j)
179 worker[
j] =
new (
root->allocate_child())
181 task_info.partition_row_index[partition] + 2 *
j,
186 worker[
j]->set_ref_count(2);
192 worker[
j]->set_ref_count(1);
197 task_info.partition_row_index[partition] + 2 *
j +
206 worker[evens] =
new (worker[
j]->allocate_child())
208 task_info.partition_row_index[partition] +
212 tbb::task::spawn(*worker[evens]);
218 tbb::task::spawn(*child);
223 root->wait_for_all();
226 tbb::empty_task::spawn(*
dummy);
233 MFWorkerInterface &function;
235 const TaskInfo & task_info;
260 task_info.cell_partition_data[
partition] +
261 task_info.block_size * r.
begin();
264 task_info.cell_partition_data[partition + 1]);
267 if (task_info.face_partition_data.empty() ==
false)
274 MFWorkerInterface &worker;
275 const TaskInfo & task_info;
298 const unsigned int n_chunks =
299 (task_info.cell_partition_data[
partition + 1] -
300 task_info.cell_partition_data[
partition] + task_info.block_size -
302 task_info.block_size;
304 CellWork(worker, task_info, partition));
306 tbb::empty_task::spawn(*
dummy);
313 MFWorkerInterface &worker;
315 const TaskInfo & task_info;
328 , do_compress(do_compress)
334 if (do_compress ==
false)
335 worker.vector_update_ghosts_finish();
337 worker.vector_compress_start();
342 MFWorkerInterface &worker;
343 const bool do_compress;
360 funct.cell_loop_pre_range(
363 funct.vector_update_ghosts_start();
365#if defined(DEAL_II_WITH_TBB) && !defined(DEAL_II_TBB_WITH_ONEAPI)
375 std::vector<partition::PartitionWork *> worker(
n_workers);
381 for (
unsigned int j = 0;
j <
evens; ++
j)
385 worker[
j] =
new (
root->allocate_child())
386 partition::PartitionWork(
funct, 2 *
j, *
this,
false);
387 worker[
j]->set_ref_count(2);
395 partition::PartitionWork(
funct, 2 *
j, *
this,
false);
396 worker[
j]->set_ref_count(2);
398 new (worker[
j]->allocate_child())
405 partition::PartitionWork(
funct, 2 *
j + 1, *
this,
true);
411 worker[
evens] =
new (worker[
j]->allocate_child())
412 partition::PartitionWork(
funct,
416 tbb::task::spawn(*worker[
evens]);
422 tbb::task::spawn(*child);
427 root->wait_for_all();
435 funct.vector_update_ghosts_finish();
436 funct.vector_compress_start();
451 std::vector<color::PartitionWork *> worker(
n_workers);
459 for (
unsigned int part = 0;
466 color::PartitionWork(
funct,
472 color::PartitionWork(
funct,
484 color::PartitionWork(
funct,
521 color::PartitionWork(
funct,
531 color::PartitionWork(
funct,
553 color::PartitionWork(
funct,
566 tbb::task::spawn(*
final);
575 root->wait_for_all();
584 funct.vector_update_ghosts_finish();
591 root->set_ref_count(2);
592 color::PartitionWork *worker =
593 new (
root->allocate_child())
594 color::PartitionWork(
funct,
color, *
this,
false);
595 tbb::empty_task::spawn(*worker);
596 root->wait_for_all();
600 funct.vector_compress_start();
613 funct.vector_update_ghosts_finish();
619 funct.cell_loop_pre_range(i);
620 funct.zero_dst_vector_range(i);
635 funct.cell_loop_post_range(i);
639 funct.vector_compress_start();
642 funct.vector_compress_finish();
647 funct.cell_loop_post_range(
689 template <
typename StreamType>
700 out <<
" MB" << std::endl;
749 for (
unsigned int j =
797 const unsigned int dofs_per_cell,
800 const bool cell_vectorization_categories_strict,
851 const unsigned int index =
868 if (cell_vectorization_categories_strict ==
false &&
n_categories > 1)
900 std::vector<std::pair<unsigned int, unsigned int>>
grouped_cells;
912 unsigned int length = 0;
928 for (
unsigned int j = 0;
j < length; ++
j)
935 for (
unsigned int j = 0;
j < length; ++
j)
965 std::vector<std::array<unsigned int, 3>>
batch_order;
969 unsigned int max_index = 0;
970 for (
unsigned int j = 0;
j < n_lanes; ++
j)
978 const std::array<unsigned int, 3> next{{
category_hp, max_index, i}};
993 std::vector<unsigned int> blocks;
1036 const unsigned int index =
1044 std::array<unsigned int, 2>{{
index, i}});
1054 for (
unsigned int j =
counters[i] % n_lanes;
j < n_lanes; ++
j)
1056 std::array<unsigned int, 2>{
1057 {i, numbers::invalid_unsigned_int}});
1073 cell_partition_data.clear();
1074 cell_partition_data.resize(1, 0);
1077 renumbering.resize(n_active_cells + n_ghost_cells,
1080 unsigned int counter = 0;
1081 for (
unsigned int block = 0; block < blocks.size() - 1; ++block)
1084 std::max((2048U / dofs_per_cell) / 8 * 4, 2U);
1085 for (
unsigned int k = blocks[block];
k < blocks[block + 1];
1087 cell_partition_data.push_back(
1089 partition_row_index[block + 1] = cell_partition_data.size() - 1;
1092 for (
unsigned int k = blocks[block];
k < blocks[block + 1]; ++
k)
1109 for (
unsigned int cell = 0; cell < n_ghost_cells; ++cell)
1110 renumbering[n_active_cells + cell] = n_active_cells + cell;
1112 if ((n_ghost_cells % n_lanes) != 0
u)
1118 ++
k, ptr += n_lanes)
1138 partition_row_index.back() = cell_partition_data.size() - 1;
1151 TaskInfo::initial_setup_blocks_tasks(
1156 const unsigned int n_cell_batches =
1157 (n_active_cells + vectorization_length - 1) / vectorization_length;
1159 (n_ghost_cells + vectorization_length - 1) / vectorization_length;
1161 if (n_cell_batches * vectorization_length > n_active_cells)
1163 vectorization_length -
1164 (n_cell_batches * vectorization_length - n_active_cells);
1167 vectorization_length -
1175 for (
unsigned int j = 0;
j < n_active_cells; ++
j)
1182 for (
unsigned int j = n_active_cells;
j < n_active_cells + n_ghost_cells;
1188 cell_partition_data.clear();
1189 cell_partition_data.push_back(0);
1194 vectorization_length;
1195 cell_partition_data.push_back(
1197 cell_partition_data.push_back(cell_partition_data[1] +
1202 cell_partition_data.push_back(n_cell_batches);
1203 cell_partition_data.push_back(cell_partition_data.back() +
n_ghost_slots);
1204 partition_row_index.resize(n_procs > 1 ? 4 : 2);
1205 partition_row_index[0] = 0;
1206 partition_row_index[1] = 1;
1209 partition_row_index[2] = 2;
1210 partition_row_index[3] = 3;
1217 TaskInfo::guess_block_size(
const unsigned int dofs_per_cell)
1220 if (block_size == 0)
1225 vectorization_length);
1229 const unsigned int minimum_parallel_grain_size = 200;
1230 if (dofs_per_cell * block_size < minimum_parallel_grain_size)
1231 block_size = (minimum_parallel_grain_size / dofs_per_cell + 1);
1232 if (dofs_per_cell * block_size > 10000)
1236 1 <<
static_cast<unsigned int>(std::log2(block_size + 1));
1238 if (block_size > n_active_cells)
1239 block_size =
std::max(1U, n_active_cells);
1245 TaskInfo::make_thread_graph_partition_color(
1251 const unsigned int n_cell_batches = *(cell_partition_data.end() - 2);
1252 if (n_cell_batches == 0)
1257 unsigned int partition = 0, counter = 0;
1289 make_partitioning(connectivity,
1297 make_coloring_within_partitions_pre_blocked(connectivity,
1318 std::vector<unsigned int> block_start(n_cell_batches + 1);
1319 std::vector<unsigned char>
irregular(n_cell_batches);
1323 for (
unsigned int block = 0; block < n_blocks; ++block)
1325 block_start[block + 1] = block_start[block];
1332 vectorization_length;
1333 block_start[block + 1] +=
n_comp;
1341 n_cell_batches - block_size * (n_blocks - 1);
1345 unsigned int tick = 0;
1346 for (
unsigned int block = 0; block < n_blocks; ++block)
1358 if (cell_partition_data[
tick] == block)
1392 TaskInfo::make_thread_graph(
1393 const std::vector<unsigned int> &cell_active_fe_index,
1399 const unsigned int n_cell_batches = *(cell_partition_data.end() - 2);
1400 if (n_cell_batches == 0)
1412 unsigned int n_blocks = 0;
1413 if (scheme == partition_color ||
1415 n_blocks = this->n_blocks;
1417 n_blocks = n_active_cells;
1434 unsigned int partition = 0;
1441 if (scheme == partition_partition)
1445 if (scheme == partition_color || scheme == color)
1453 make_partitioning(connectivity,
1461 if (scheme == partition_partition)
1465 make_partitioning_within_partitions_post_blocked(
1467 cell_active_fe_index,
1477 else if (scheme == partition_color || scheme == color)
1500 if (scheme == partition_partition)
1508 for (
unsigned int i = 0; i < n_ghost_cells; ++i)
1515 std::vector<unsigned int> block_start(n_cell_batches + 1);
1516 std::vector<unsigned char>
irregular(n_cell_batches);
1518 unsigned int counter = 0;
1521 for (
unsigned int block = 0; block < n_blocks; ++block)
1523 block_start[block + 1] = block_start[block];
1530 vectorization_length;
1531 block_start[block + 1] +=
n_comp;
1539 n_cell_batches - block_size * (n_blocks - 1);
1543 unsigned int tick = 0;
1544 for (
unsigned int block = 0; block < n_blocks; ++block)
1556 if (cell_partition_data[
tick] == block)
1581 update_task_info(partition);
1587 TaskInfo::make_thread_graph_partition_partition(
1588 const std::vector<unsigned int> &cell_active_fe_index,
1594 const unsigned int n_cell_batches = *(cell_partition_data.end() - 2);
1595 if (n_cell_batches == 0)
1598 const unsigned int cluster_size = block_size * vectorization_length;
1617 unsigned int partition = 0;
1622 make_partitioning(connectivity,
1630 make_partitioning_within_partitions_post_blocked(connectivity,
1631 cell_active_fe_index,
1646 for (
unsigned int i = 0; i < n_ghost_cells; ++i)
1649 update_task_info(partition);
1655 TaskInfo::make_connectivity_cells_to_blocks(
1660 std::vector<std::vector<unsigned int>>
cell_blocks(n_blocks);
1662 unsigned int cell = 0;
1663 for (
unsigned int i = 0,
mcell = 0; i < n_blocks; ++i)
1665 for (
unsigned int c = 0;
1666 c < block_size &&
mcell < *(cell_partition_data.end() - 2);
1671 vectorization_length;
1672 for (
unsigned int c = 0; c <
ncomp; ++c, ++cell)
1699 TaskInfo::make_partitioning_within_partitions_post_blocked(
1701 const std::vector<unsigned int> &cell_active_fe_index,
1702 const unsigned int partition,
1711 const unsigned int n_cell_batches = *(cell_partition_data.end() - 2);
1713 *(cell_partition_data.end() - 1) - n_cell_batches;
1721 std::vector<unsigned int>
renumbering(n_active_cells);
1726 unsigned int max_fe_index = 0;
1727 for (
const unsigned int fe_index : cell_active_fe_index)
1728 max_fe_index =
std::max(fe_index, max_fe_index);
1742 partition_row_index.clear();
1743 partition_row_index.resize(partition + 1, 0);
1744 cell_partition_data.resize(1, 0);
1746 unsigned int counter = 0;
1795 const auto end_it = connectivity.
end(neighbor);
1823 std::vector<std::vector<unsigned int>>
1835 [cell_active_fe_index.empty() ?
1837 cell_active_fe_index
1842 for (
unsigned int j = 0;
j < max_fe_index + 1; ++
j)
1846 vectorization_length;
1851 vectorization_length - 1) /
1852 vectorization_length);
1901 for (; neighbor != end; ++neighbor)
1910 cell_active_fe_index.
empty() ?
1912 cell_active_fe_index[neighbor
1925 neighbor->column());
1928 .push_back(neighbor->column());
1940 vectorization_length)
1948 for (
unsigned int fe_ind = 0;
1949 fe_ind < max_fe_index + 1;
1967 for (
unsigned int j = 0;
j < max_fe_index + 1; ++
j)
1969 for (
const unsigned int jj :
1973 vectorization_length !=
1976 vectorization_length +
1979 vectorization_length;
1982 vectorization_length - 1) /
1983 vectorization_length;
2005 partition_row_index[
part + 1] =
2020 TaskInfo::make_coloring_within_partitions_pre_blocked(
2022 const unsigned int partition,
2028 const unsigned int n_cell_batches = *(cell_partition_data.end() - 2);
2029 std::vector<unsigned int>
cell_color(n_blocks, n_cell_batches);
2032 partition_row_index.resize(partition + 1);
2033 cell_partition_data.clear();
2052 connectivity.
begin(cell),
2053 end = connectivity.
end(cell);
2054 for (; neighbor != end; ++neighbor)
2072 for (
unsigned int color = 0; color <=
max_color; ++color)
2088 cell_partition_data.push_back(n_blocks);
2101 unsigned int & partition)
const
2123 unsigned int counter = 0;
2125 cell_partition_data.
size() == 5 ?
2126 vectorization_length *
2127 (cell_partition_data[2] - cell_partition_data[1]) :
2130 const unsigned int n_cell_batches = *(cell_partition_data.end() - 2);
2131 if (n_cell_batches == 0)
2133 if (scheme == color)
2135 if (scheme == partition_color ||
2138 unsigned int n_blocks;
2139 if (scheme == partition_color ||
2141 n_blocks = this->n_blocks;
2143 n_blocks = n_active_cells;
2163 const unsigned int cell_nn = cell;
2208 for (; neighbor != end; ++neighbor)
2240 auto neighbor = connectivity.
begin(cell);
2241 const auto end = connectivity.
end(cell);
2242 for (; neighbor != end; ++neighbor)
2282 end = connectivity.
end(
2284 for (; neighbor != end; ++neighbor)
2307 for (
unsigned int j =
start_up;
j < n_blocks; ++
j)
2325 TaskInfo::update_task_info(
const unsigned int partition)
2327 evens = (partition + 1) / 2;
2328 odds = partition / 2;
2329 n_blocked_workers = odds - (odds + evens + 1) % 2;
2330 n_workers = evens + odds - n_blocked_workers;
2332 partition_evens.resize(partition);
2333 partition_odds.resize(partition);
2334 partition_n_blocked_workers.resize(partition);
2335 partition_n_workers.resize(partition);
2338 partition_evens[
part] =
2339 (partition_row_index[
part + 1] - partition_row_index[
part] + 1) / 2;
2340 partition_odds[
part] =
2341 (partition_row_index[
part + 1] - partition_row_index[
part]) / 2;
2342 partition_n_blocked_workers[
part] =
2343 partition_odds[
part] -
2344 (partition_odds[
part] + partition_evens[
part] + 1) % 2;
2345 partition_n_workers[
part] = partition_evens[
part] +
2346 partition_odds[
part] -
2347 partition_n_blocked_workers[
part];
2357internal::MatrixFreeFunctions::TaskInfo::print_memory_statistics<std::ostream>(
2359 const std::size_t)
const;
size_type row_length(const size_type row) const
static unsigned int n_threads()
#define DEAL_II_NAMESPACE_OPEN
#define DEAL_II_NAMESPACE_CLOSE
static ::ExceptionBase & ExcNotImplemented()
#define Assert(cond, exc)
#define AssertDimension(dim1, dim2)
#define AssertIndexRange(index, range)
static ::ExceptionBase & ExcInternalError()
#define AssertThrow(cond, exc)
std::enable_if_t< std::is_fundamental< T >::value, std::size_t > memory_consumption(const T &t)
MinMaxAvg min_max_avg(const double my_value, const MPI_Comm mpi_communicator)
std::vector< Integer > invert_permutation(const std::vector< Integer > &permutation)
unsigned int indicate_power_of_two(const unsigned int vectorization_length)
static const unsigned int invalid_unsigned_int
void parallel_for(Iterator x_begin, Iterator x_end, const Functor &functor, const unsigned int grainsize)
::VectorizedArray< Number, width > min(const ::VectorizedArray< Number, width > &, const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > max(const ::VectorizedArray< Number, width > &, const ::VectorizedArray< Number, width > &)
unsigned int n_ghost_cells
std::size_t memory_consumption() const
std::vector< unsigned int > boundary_partition_data
void loop(MFWorkerInterface &worker) const
std::vector< unsigned int > partition_n_workers
unsigned int vectorization_length
void create_blocks_serial(const std::vector< unsigned int > &cells_with_comm, const unsigned int dofs_per_cell, const bool categories_are_hp, const std::vector< unsigned int > &cell_vectorization_categories, const bool cell_vectorization_categories_strict, const std::vector< unsigned int > &parent_relation, std::vector< unsigned int > &renumbering, std::vector< unsigned char > &incompletely_filled_vectorization)
unsigned int n_active_cells
void print_memory_statistics(StreamType &out, std::size_t data_length) const
std::vector< unsigned int > partition_row_index
std::vector< unsigned int > partition_evens
unsigned int n_blocked_workers
std::vector< unsigned int > partition_n_blocked_workers
std::vector< unsigned int > cell_partition_data
void make_boundary_cells_divisible(std::vector< unsigned int > &boundary_cells)
TasksParallelScheme scheme
std::vector< unsigned int > partition_odds
std::vector< unsigned int > face_partition_data