758 *
return 1. / (0.05 + 2. * p.square());
765 *
const unsigned int component)
const
774 * <a name=
"Matrixfreeimplementation"></a>
867 *
class LaplaceOperator
872 *
using value_type = number;
876 *
void clear()
override;
883 *
virtual void apply_add(
891 *
const std::pair<unsigned int, unsigned int> &
cell_range)
const;
896 *
const unsigned int &
dummy,
897 *
const std::pair<unsigned int, unsigned int> &
cell_range)
const;
910 *
e.g. in a preconditioner.
913 *
template <
int dim,
int fe_degree,
typename number>
914 *
LaplaceOperator<dim, fe_degree, number>::LaplaceOperator()
921 *
template <
int dim,
int fe_degree,
typename number>
922 *
void LaplaceOperator<dim, fe_degree, number>::clear()
934 * <a name=
"Computationofcoefficient"></a>
947 *
void LaplaceOperator<dim, fe_degree, number>::evaluate_coefficient(
950 *
const unsigned int n_cells = this->
data->n_cell_batches();
954 *
for (
unsigned int cell = 0; cell <
n_cells; ++cell)
957 *
for (
unsigned int q = 0;
q <
phi.n_q_points; ++
q)
968 * <a name=
"LocalevaluationofLaplaceoperator"></a>
990 * cell iterators, in
this class all cells
are laid out in a
plain array
1068 *
template <
int dim,
int fe_degree,
typename number>
1069 *
void LaplaceOperator<dim, fe_degree, number>::local_apply(
1073 *
const std::pair<unsigned int, unsigned int> &
cell_range)
const
1083 *
phi.read_dof_values(src);
1085 *
for (
unsigned int q = 0;
q <
phi.n_q_points; ++
q)
1088 *
phi.distribute_local_to_global(dst);
1107 * src.update_ghost_values();
1109 *
data.n_cell_batches()));
1185 *
void LaplaceOperator<dim, fe_degree, number>::apply_add(
1189 *
this->data->cell_loop(&LaplaceOperator::local_apply,
this, dst, src);
1236 *
void LaplaceOperator<dim, fe_degree, number>::compute_diagonal()
1243 *
unsigned int dummy = 0;
1244 *
this->data->cell_loop(&LaplaceOperator::local_compute_diagonal,
1254 *
ExcMessage(
"No diagonal entry in a positive definite operator "
1255 *
"should be zero"));
1311 *
href=
"http://dx.doi.org/10.4208/cicp.101214.021015a">
Kormann (2016),
1317 *
template <
int dim,
int fe_degree,
typename number>
1318 *
void LaplaceOperator<dim, fe_degree, number>::local_compute_diagonal(
1321 *
const unsigned int &,
1322 *
const std::pair<unsigned int, unsigned int> &
cell_range)
const
1334 *
for (
unsigned int i = 0; i <
phi.dofs_per_cell; ++i)
1336 *
for (
unsigned int j = 0;
j <
phi.dofs_per_cell; ++
j)
1341 *
for (
unsigned int q = 0;
q <
phi.n_q_points; ++
q)
1347 *
for (
unsigned int i = 0; i <
phi.dofs_per_cell; ++i)
1348 *
phi.submit_dof_value(diagonal[i], i);
1349 *
phi.distribute_local_to_global(dst);
1358 * <a name=
"LaplaceProblemclass"></a>
1388 *
template <
int dim>
1444 *
template <
int dim>
1450 *
dim>::construct_multigrid_hierarchy)
1477 * <a name=
"LaplaceProblemsetup_system"></a>
1490 * @ref step_40 "step-40".
1514 *
template <
int dim>
1520 *
system_matrix.clear();
1523 *
dof_handler.distribute_dofs(fe);
1524 *
dof_handler.distribute_mg_dofs();
1526 *
pcout <<
"Number of degrees of freedom: " << dof_handler.n_dofs()
1532 *
constraints.clear();
1537 *
constraints.close();
1539 *
time_details <<
"Distribute DoFs & B.C. (CPU/wall) " << time.cpu_time()
1540 *
<<
"s/" << time.wall_time() <<
's' << std::endl;
1545 *
additional_data.tasks_parallel_scheme =
1547 *
additional_data.mapping_update_flags =
1561 *
system_matrix.initialize_dof_vector(solution);
1562 *
system_matrix.initialize_dof_vector(
system_rhs);
1565 *
time_details <<
"Setup matrix-free system (CPU/wall) " << time.cpu_time()
1566 *
<<
"s/" << time.wall_time() <<
's' << std::endl;
1586 *
mg_constrained_dofs.initialize(dof_handler);
1587 *
mg_constrained_dofs.make_zero_boundary_constraints(dof_handler,
1596 *
level_constraints.add_lines(
1597 *
mg_constrained_dofs.get_boundary_indices(
level));
1598 *
level_constraints.close();
1601 *
additional_data.tasks_parallel_scheme =
1603 *
additional_data.mapping_update_flags =
1605 *
additional_data.mg_level =
level;
1610 *
level_constraints,
1615 *
mg_constrained_dofs,
1620 *
time_details <<
"Setup matrix-free levels (CPU/wall) " << time.cpu_time()
1621 *
<<
"s/" << time.wall_time() <<
's' << std::endl;
1629 * <a name=
"LaplaceProblemassemble_rhs"></a>
1651 *
*system_matrix.get_matrix_free());
1652 *
for (
unsigned int cell = 0;
1653 *
cell < system_matrix.get_matrix_free()->n_cell_batches();
1657 *
for (
unsigned int q = 0;
q <
phi.n_q_points; ++
q)
1665 *
time_details <<
"Assemble right hand side (CPU/wall) " << time.cpu_time()
1666 *
<<
"s/" << time.wall_time() <<
's' << std::endl;
1674 * <a name=
"LaplaceProblemsolve"></a>
1675 * <h4>LaplaceProblem::solve</h4>
1686 *
template <
int dim>
1693 *
time_details <<
"MG build transfer time (CPU/wall) " << time.cpu_time()
1694 *
<<
"s/" << time.wall_time() <<
"s\n";
1716 *
by our LaplaceOperator
class.
1800 *
"Multigrid paper by Janssen and Kanschat" for more details.
1859 *
<< "s/" << time.wall_time() << "s\n";
1864 *
constraints.set_zero(solution);
1865 *
cg.solve(system_matrix, solution,
system_rhs, preconditioner);
1867 *
constraints.distribute(solution);
1870 *
<< (solver_control.last_step() < 10 ? " " : " ") << "(
CPU/
wall) "
1871 *
<< time.cpu_time() << "s/" << time.wall_time() << "s\n";
1892 *
of the linear solve.
1911 *
solution.update_ghost_values();
1912 *
data_out.attach_dof_handler(dof_handler);
1913 *
data_out.add_data_vector(solution,
"solution");
1914 *
data_out.build_patches(mapping);
1918 *
data_out.set_flags(flags);
1919 *
data_out.write_vtu_with_pvtu_record(
1922 *
time_details <<
"Time write output (CPU/wall) " << time.cpu_time()
1923 *
<<
"s/" << time.wall_time() <<
"s\n";
1931 * <a name=
"LaplaceProblemrun"></a>
1932 * <h4>LaplaceProblem::run</h4>
1942 * Before we run the program, we output some information about the detected
1943 * vectorization level as discussed in the introduction.
1946 * template <int dim>
1947 * void LaplaceProblem<dim>::run()
1950 * const unsigned int n_vect_doubles = VectorizedArray<double>::size();
1951 * const unsigned int n_vect_bits = 8 * sizeof(double) * n_vect_doubles;
1953 * pcout << "Vectorization over " << n_vect_doubles
1954 * << " doubles = " << n_vect_bits << " bits ("
1955 * << Utilities::System::get_current_vectorization_level() << ')
'
1959 * for (unsigned int cycle = 0; cycle < 9 - dim; ++cycle)
1961 * pcout << "Cycle " << cycle << std::endl;
1965 * GridGenerator::hyper_cube(triangulation, 0., 1.);
1966 * triangulation.refine_global(3 - dim);
1968 * triangulation.refine_global(1);
1972 * output_results(cycle);
1973 * pcout << std::endl;
1976 * } // namespace Step37
1983 * <a name="Thecodemaincodefunction"></a>
1984 * <h3>The <code>main</code> function</h3>
1988 * Apart from the fact that we set up the MPI framework according to @ref step_40 "step-40",
1989 * there are no surprises in the main function.
1992 * int main(int argc, char *argv[])
1996 * using namespace Step37;
1998 * Utilities::MPI::MPI_InitFinalize mpi_init(argc, argv, 1);
2000 * LaplaceProblem<dimension> laplace_problem;
2001 * laplace_problem.run();
2003 * catch (std::exception &exc)
2005 * std::cerr << std::endl
2007 * << "----------------------------------------------------"
2009 * std::cerr << "Exception on processing: " << std::endl
2010 * << exc.what() << std::endl
2011 * << "Aborting!" << std::endl
2012 * << "----------------------------------------------------"
2018 * std::cerr << std::endl
2020 * << "----------------------------------------------------"
2022 * std::cerr << "Unknown exception!" << std::endl
2023 * << "Aborting!" << std::endl
2024 * << "----------------------------------------------------"
2032<a name="Results"></a><h1>Results</h1>
2035<a name="Programoutput"></a><h3>Program output</h3>
2038Since this example solves the same problem as @ref step_5 "step-5" (except for
2039a different coefficient), there is little to say about the
2040solution. We show a picture anyway, illustrating the size of the
2041solution through both isocontours and volume rendering:
2043<img src="https://www.dealii.org/images/steps/developer/step-37.solution.png" alt="">
2045Of more interest is to evaluate some aspects of the multigrid solver.
2046When we run this program in 2D for quadratic (@f$Q_2@f$) elements, we get the
2047following output (when run on one core in release mode):
2049Vectorization over 2 doubles = 128 bits (SSE2)
2051Number of degrees of freedom: 81
2052Total setup time (wall) 0.00159788s
2053Time solve (6 iterations) (CPU/wall) 0.000951s/0.000951052s
2056Number of degrees of freedom: 289
2057Total setup time (wall) 0.00114608s
2058Time solve (6 iterations) (CPU/wall) 0.000935s/0.000934839s
2061Number of degrees of freedom: 1089
2062Total setup time (wall) 0.00244665s
2063Time solve (6 iterations) (CPU/wall) 0.00207s/0.002069s
2066Number of degrees of freedom: 4225
2067Total setup time (wall) 0.00678205s
2068Time solve (6 iterations) (CPU/wall) 0.005616s/0.00561595s
2071Number of degrees of freedom: 16641
2072Total setup time (wall) 0.0241671s
2073Time solve (6 iterations) (CPU/wall) 0.019543s/0.0195441s
2076Number of degrees of freedom: 66049
2077Total setup time (wall) 0.0967851s
2078Time solve (6 iterations) (CPU/wall) 0.07457s/0.0745709s
2081Number of degrees of freedom: 263169
2082Total setup time (wall) 0.346374s
2083Time solve (6 iterations) (CPU/wall) 0.260042s/0.265033s
2086As in @ref step_16 "step-16", we see that the number of CG iterations remains constant with
2087increasing number of degrees of freedom. A constant number of iterations
2088(together with optimal computational properties) means that the computing time
2089approximately quadruples as the problem size quadruples from one cycle to the
2090next. The code is also very efficient in terms of storage. Around 2-4 million
2091degrees of freedom fit into 1 GB of memory, see also the MPI results below. An
2092interesting fact is that solving one linear system is cheaper than the setup,
2093despite not building a matrix (approximately half of which is spent in the
2094DoFHandler::distribute_dofs() and DoFHandler::distribute_mg_dofs()
2095calls). This shows the high efficiency of this approach, but also that the
2096deal.II data structures are quite expensive to set up and the setup cost must
2097be amortized over several system solves.
2099Not much changes if we run the program in three spatial dimensions. Since we
2100use uniform mesh refinement, we get eight times as many elements and
2101approximately eight times as many degrees of freedom with each cycle:
2104Vectorization over 2 doubles = 128 bits (SSE2)
2106Number of degrees of freedom: 125
2107Total setup time (wall) 0.00231099s
2108Time solve (6 iterations) (CPU/wall) 0.000692s/0.000922918s
2111Number of degrees of freedom: 729
2112Total setup time (wall) 0.00289083s
2113Time solve (6 iterations) (CPU/wall) 0.001534s/0.0024128s
2116Number of degrees of freedom: 4913
2117Total setup time (wall) 0.0143182s
2118Time solve (6 iterations) (CPU/wall) 0.010785s/0.0107841s
2121Number of degrees of freedom: 35937
2122Total setup time (wall) 0.087064s
2123Time solve (6 iterations) (CPU/wall) 0.063522s/0.06545s
2126Number of degrees of freedom: 274625
2127Total setup time (wall) 0.596306s
2128Time solve (6 iterations) (CPU/wall) 0.427757s/0.431765s
2131Number of degrees of freedom: 2146689
2132Total setup time (wall) 4.96491s
2133Time solve (6 iterations) (CPU/wall) 3.53126s/3.56142s
2136Since it is so easy, we look at what happens if we increase the polynomial
2137degree. When selecting the degree as four in 3D, i.e., on @f$\mathcal Q_4@f$
2138elements, by changing the line <code>const unsigned int
2139degree_finite_element=4;</code> at the top of the program, we get the
2140following program output:
2143Vectorization over 2 doubles = 128 bits (SSE2)
2145Number of degrees of freedom: 729
2146Total setup time (wall) 0.00633097s
2147Time solve (6 iterations) (CPU/wall) 0.002829s/0.00379395s
2150Number of degrees of freedom: 4913
2151Total setup time (wall) 0.0174279s
2152Time solve (6 iterations) (CPU/wall) 0.012255s/0.012254s
2155Number of degrees of freedom: 35937
2156Total setup time (wall) 0.082655s
2157Time solve (6 iterations) (CPU/wall) 0.052362s/0.0523629s
2160Number of degrees of freedom: 274625
2161Total setup time (wall) 0.507943s
2162Time solve (6 iterations) (CPU/wall) 0.341811s/0.345788s
2165Number of degrees of freedom: 2146689
2166Total setup time (wall) 3.46251s
2167Time solve (7 iterations) (CPU/wall) 3.29638s/3.3265s
2170Number of degrees of freedom: 16974593
2171Total setup time (wall) 27.8989s
2172Time solve (7 iterations) (CPU/wall) 26.3705s/27.1077s
2175Since @f$\mathcal Q_4@f$ elements on a certain mesh correspond to @f$\mathcal Q_2@f$
2176elements on half the mesh size, we can compare the run time at cycle 4 with
2177fourth degree polynomials with cycle 5 using quadratic polynomials, both at
21782.1 million degrees of freedom. The surprising effect is that the solver for
2179@f$\mathcal Q_4@f$ elements is actually slightly faster than for the quadratic
2180case, despite using one more linear iteration. The effect that higher-degree
2181polynomials are similarly fast or even faster than lower degree ones is one of
2182the main strengths of matrix-free operator evaluation through sum
2183factorization, see the <a
2184href="http://dx.doi.org/10.1016/j.compfluid.2012.04.012">matrix-free
2185paper</a>. This is fundamentally different to matrix-based methods that get
2186more expensive per unknown as the polynomial degree increases and the coupling gets denser.
2189In addition, the setup also gets a bit cheaper for higher order, which is
2190because fewer elements need to be set up.
2192Finally, let us look at the timings with degree 8, which corresponds to
2193another round of mesh refinement in the lower order methods:
2196Vectorization over 2 doubles = 128 bits (SSE2)
2198Number of degrees of freedom: 4913
2199Total setup time (wall) 0.0842004s
2200Time solve (8 iterations) (CPU/wall) 0.019296s/0.0192959s
2203Number of degrees of freedom: 35937
2204Total setup time (wall) 0.327048s
2205Time solve (8 iterations) (CPU/wall) 0.07517s/0.075999s
2208Number of degrees of freedom: 274625
2209Total setup time (wall) 2.12335s
2210Time solve (8 iterations) (CPU/wall) 0.448739s/0.453698s
2213Number of degrees of freedom: 2146689
2214Total setup time (wall) 16.1743s
2215Time solve (8 iterations) (CPU/wall) 3.95003s/3.97717s
2218Number of degrees of freedom: 16974593
2219Total setup time (wall) 130.8s
2220Time solve (8 iterations) (CPU/wall) 31.0316s/31.767s
2223Here, the initialization seems considerably slower than before, which is
2224mainly due to the computation of the diagonal of the matrix, which actually
2225computes a 729 x 729 matrix on each cell and throws away everything but the
2226diagonal. The solver times, however, are again very close to the quartic case,
2227showing that the linear increase with the polynomial degree that is
2228theoretically expected is almost completely offset by better computational
2229characteristics and the fact that higher order methods have a smaller share of
2230degrees of freedom living on several cells that add to the evaluation complexity.
2233<a name="Comparisonwithasparsematrix"></a><h3>Comparison with a sparse matrix</h3>
2236In order to understand the capabilities of the matrix-free implementation, we
2237compare the performance of the 3d example above with a sparse matrix
2238implementation based on TrilinosWrappers::SparseMatrix by measuring both the
2239computation times for the initialization of the problem (distribute DoFs,
2240setup and assemble matrices, setup multigrid structures) and the actual
2241solution for the matrix-free variant and the variant based on sparse
2242matrices. We base the preconditioner on float numbers and the actual matrix
2243and vectors on double numbers, as shown above. Tests are run on an Intel Core
2244i7-5500U notebook processor (two cores and <a
2245href="http://en.wikipedia.org/wiki/Advanced_Vector_Extensions">AVX</a>
2246support, i.e., four operations on doubles can be done with one CPU
2247instruction, which is heavily used in FEEvaluation), optimized mode, and two MPI ranks.
2250<table align="center" class="doxtable">
2253 <th colspan="2">Sparse matrix</th>
2254 <th colspan="2">Matrix-free implementation</th>
2258 <th>Setup + assemble</th>
2259 <th> Solve </th>
2260 <th>Setup + assemble</th>
2261 <th> Solve </th>
2264 <td align="right">125</td>
2265 <td align="center">0.0042s</td>
2266 <td align="center">0.0012s</td>
2267 <td align="center">0.0022s</td>
2268 <td align="center">0.00095s</td>
2271 <td align="right">729</td>
2272 <td align="center">0.012s</td>
2273 <td align="center">0.0040s</td>
2274 <td align="center">0.0027s</td>
2275 <td align="center">0.0021s</td>
2278 <td align="right">4,913</td>
2279 <td align="center">0.082s</td>
2280 <td align="center">0.012s</td>
2281 <td align="center">0.011s</td>
2282 <td align="center">0.0057s</td>
2285 <td align="right">35,937</td>
2286 <td align="center">0.73s</td>
2287 <td align="center">0.13s</td>
2288 <td align="center">0.048s</td>
2289 <td align="center">0.040s</td>
2292 <td align="right">274,625</td>
2293 <td align="center">5.43s</td>
2294 <td align="center">1.01s</td>
2295 <td align="center">0.33s</td>
2296 <td align="center">0.25s</td>
2299 <td align="right">2,146,689</td>
2300 <td align="center">43.8s</td>
2301 <td align="center">8.24s</td>
2302 <td align="center">2.42s</td>
2303 <td align="center">2.06s</td>
2307The table clearly shows that the matrix-free implementation is more than twice
2308as fast for the solver, and more than six times as fast when it comes to
2309initialization costs. As the problem size is made a factor of 8 larger, we note
2310that the times usually go up by a factor of eight, too (as the solver iterations
2311are constant at six). The main deviation is in the sparse matrix between 5k
2312and 36k degrees of freedom, where the time increases by a factor of 12. This is
2313the threshold where the (L3) cache in the processor can no longer hold all
2314data necessary for the matrix-vector products and all matrix elements must be
2315fetched from main memory.
2317Of course, this picture does not necessarily translate to all cases, as there
2318are problems where knowledge of matrix entries enables much better solvers (as
2319happens when the coefficient is varying more strongly than in the above
2320example). Moreover, it also depends on the computer system. The present system
2321has good memory performance, so sparse matrices perform comparably
2322well. Nonetheless, the matrix-free implementation gives a nice speedup already
2323for the <i>Q</i><sub>2</sub> elements used in this example. This becomes
2324particularly apparent for time-dependent or nonlinear problems where sparse
2325matrices would need to be reassembled over and over again, which becomes much
2326easier with this class. And of course, thanks to the better complexity of the
2327products, the method gains increasingly larger advantages when the order of the
2328elements increases (the matrix-free implementation has costs
23294<i>d</i><sup>2</sup><i>p</i> per degree of freedom, compared to
23302<i>p<sup>d</sup></i> for the sparse matrix, so it will win anyway for order 4 and higher in 3d).
2333<a name="ResultsforlargescaleparallelcomputationsonSuperMUC"></a><h3> Results for large-scale parallel computations on SuperMUC</h3>
2336As explained in the introduction and the in-code comments, this program can be
2337run in parallel with MPI. It turns out that geometric multigrid schemes work
2338really well and can scale to very large machines. To the authors' knowledge,
2341href=
"https://www.lrz.de/services/compute/supermuc/systemdescription/">
complete
2384<img src="https://www.dealii.org/images/steps/developer/step-37.scaling_strong.png" alt="">
2416<img src="https://www.dealii.org/images/steps/developer/step-37.scaling_size.png" alt="">
2428<img src="https://www.dealii.org/images/steps/developer/step-37.scaling_oldnew.png" alt="">
2443<a name="Possibilitiesforextensions"></a><h3> Possibilities for extensions</h3>
2470solution.reinit(dof_handler.locally_owned_dofs(),
2473solution.copy_locally_owned_data_from(
copy_vec);
2474constraints.distribute(solution);
2475solution.update_ghost_values();
2571 if (solution.locally_owned_elements().is_element(
pair.
first))
2578 constraints.distribute(solution);
2599the object @p constraints:
2605 constraints.distribute(solution);
2606 solution.update_ghost_values();
2611 for (
unsigned int cell = 0;
2612 cell < system_matrix.get_matrix_free()->n_cell_batches();
2616 phi.read_dof_values_plain(solution);
2618 for (
unsigned int q = 0;
q <
phi.n_q_points; ++
q)
2683 std::shared_ptr<MatrixFree<dim, double>> matrix_free(
2685 matrix_free->reinit(dof_handler,
2692 constraints.distribute(solution);
2699 for (
unsigned int cell = 0;
2704 for (
unsigned int q = 0;
q <
phi.n_q_points; ++
q)
2727improve the time per unknown. (Even higher degrees typically get slower again,
2728because the multigrid iteration counts increase slightly with the chosen
2729simple smoother. One could then use hybrid multigrid algorithms to use
2730polynomial coarsening through MGTransferGlobalCoarsening, to reduce the impact
2742will then pick up this interface and schedule its vector operations),
and </li>
2748<a name="PlainProg"></a>
value_type * data() const noexcept
void reinit(value_type *starting_element, const std::size_t n_elements)
value_type get_dof_value(const unsigned int dof) const
void read_dof_values(const VectorType &src, const unsigned int first_index=0, const std::bitset< VectorizedArrayType::size()> &mask=std::bitset< VectorizedArrayType::size()>().flip())
void evaluate(const EvaluationFlags::EvaluationFlags evaluation_flag)
void set_constrained_entries_to_one(VectorType &dst) const
void vmult_add(VectorType &dst, const VectorType &src) const
void vmult_interface_down(VectorType &dst, const VectorType &src) const
std::shared_ptr< DiagonalMatrix< VectorType > > inverse_diagonal_entries
unsigned int n_cell_batches() const
void initialize_dof_vector(VectorType &vec, const unsigned int dof_handler_index=0) const
void cell_loop(const std::function< void(const MatrixFree< dim, Number, VectorizedArrayType > &, OutVector &, const InVector &, const std::pair< unsigned int, unsigned int > &)> &cell_operation, OutVector &dst, const InVector &src, const bool zero_dst_vector=false) const
#define DEAL_II_WITH_P4EST
__global__ void reduction(Number *result, const Number *v, const size_type N)
__global__ void set(Number *val, const Number s, const size_type N)
#define Assert(cond, exc)
#define AssertDimension(dim1, dim2)
void loop(ITERATOR begin, std_cxx20::type_identity_t< ITERATOR > end, DOFINFO &dinfo, INFOBOX &info, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &cell_worker, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &boundary_worker, const std::function< void(DOFINFO &, DOFINFO &, typename INFOBOX::CellInfo &, typename INFOBOX::CellInfo &)> &face_worker, ASSEMBLER &assembler, const LoopControl &lctrl=LoopControl())
void make_hanging_node_constraints(const DoFHandler< dim, spacedim > &dof_handler, AffineConstraints< number > &constraints)
@ update_JxW_values
Transformed quadrature weights.
@ update_gradients
Shape function gradients.
@ update_quadrature_points
Transformed quadrature points.
void apply(const Kokkos::TeamPolicy< MemorySpace::Default::kokkos_space::execution_space >::member_type &team_member, const Kokkos::View< Number *, MemorySpace::Default::kokkos_space > shape_data, const ViewTypeIn in, ViewTypeOut out)
void matrix_free_data_locality(DoFHandler< dim, spacedim > &dof_handler, const MatrixFree< dim, Number, VectorizedArrayType > &matrix_free)
@ matrix
Contents is actually a matrix.
@ diagonal
Matrix is diagonal.
@ general
No special properties.
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > b(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
void call(const std::function< RT()> &function, internal::return_value< RT > &ret_val)
VectorType::value_type * end(VectorType &V)
std::vector< unsigned int > serial(const std::vector< unsigned int > &targets, const std::function< RequestType(const unsigned int)> &create_request, const std::function< AnswerType(const unsigned int, const RequestType &)> &answer_request, const std::function< void(const unsigned int, const AnswerType &)> &process_answer, const MPI_Comm comm)
T sum(const T &t, const MPI_Comm mpi_communicator)
unsigned int this_mpi_process(const MPI_Comm mpi_communicator)
T reduce(const T &local_value, const MPI_Comm comm, const std::function< T(const T &, const T &)> &combiner, const unsigned int root_process=0)
std::string compress(const std::string &input)
void run(const Iterator &begin, const std_cxx20::type_identity_t< Iterator > &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
unsigned int n_cells(const internal::TriangulationImplementation::NumberCache< 1 > &c)
void copy(const T *begin, const T *end, U *dest)
int(&) functions(const void *v1, const void *v2)
void assemble(const MeshWorker::DoFInfoBox< dim, DOFINFO > &dinfo, A *assembler)
void reinit(MatrixBlock< MatrixType > &v, const BlockSparsityPattern &p)
static const unsigned int invalid_unsigned_int
unsigned int global_dof_index
const ::parallel::distributed::Triangulation< dim, spacedim > * triangulation
TasksParallelScheme tasks_parallel_scheme
UpdateFlags mapping_update_flags
DEAL_II_HOST constexpr Number determinant(const SymmetricTensor< 2, dim, Number > &)
DEAL_II_HOST constexpr SymmetricTensor< 2, dim, Number > invert(const SymmetricTensor< 2, dim, Number > &)
std::array< Number, 1 > eigenvalues(const SymmetricTensor< 2, 1, Number > &T)