Reference documentation for deal.II version 9.5.1
\(\newcommand{\dealvcentcolon}{\mathrel{\mathop{:}}}\) \(\newcommand{\dealcoloneq}{\dealvcentcolon\mathrel{\mkern-1.2mu}=}\) \(\newcommand{\jump}[1]{\left[\!\left[ #1 \right]\!\right]}\) \(\newcommand{\average}[1]{\left\{\!\left\{ #1 \right\}\!\right\}}\)
Loading...
Searching...
No Matches
step-40.h
Go to the documentation of this file.
1
769 *   #endif
770 *   LA::MPI::PreconditionAMG preconditioner;
771 *   preconditioner.initialize(system_matrix, data);
772 *  
773 *   solver.solve(system_matrix,
775 *   system_rhs,
776 *   preconditioner);
777 *  
778 *   pcout << " Solved in " << solver_control.last_step() << " iterations."
779 *   << std::endl;
780 *  
781 *   constraints.distribute(completely_distributed_solution);
782 *  
784 *   }
785 *  
786 *  
787 *  
788 * @endcode
789 *
790 *
791 * <a name="LaplaceProblemrefine_grid"></a>
792 * <h4>LaplaceProblem::refine_grid</h4>
793 *
794
795 *
796 * The function that estimates the error and refines the grid is again
797 * almost exactly like the one in @ref step_6 "step-6". The only difference is that the
798 * function that flags cells to be refined is now in namespace
802 *
803
804 *
805 * Note that we didn't have to do anything special about the
806 * KellyErrorEstimator class: we just give it a vector with as many elements
807 * as the local triangulation has cells (locally owned cells, ghost cells,
808 * and artificial ones), but it only fills those entries that correspond to
809 * cells that are locally owned.
810 *
811 * @code
812 *   template <int dim>
813 *   void LaplaceProblem<dim>::refine_grid()
814 *   {
815 *   TimerOutput::Scope t(computing_timer, "refine");
816 *  
817 *   Vector<float> estimated_error_per_cell(triangulation.n_active_cells());
818 *   KellyErrorEstimator<dim>::estimate(
819 *   dof_handler,
820 *   QGauss<dim - 1>(fe.degree + 1),
821 *   std::map<types::boundary_id, const Function<dim> *>(),
822 *   locally_relevant_solution,
823 *   estimated_error_per_cell);
824 *   parallel::distributed::GridRefinement::refine_and_coarsen_fixed_number(
825 *   triangulation, estimated_error_per_cell, 0.3, 0.03);
826 *   triangulation.execute_coarsening_and_refinement();
827 *   }
828 *  
829 *  
830 *  
831 * @endcode
832 *
833 *
834 * <a name="LaplaceProblemoutput_results"></a>
835 * <h4>LaplaceProblem::output_results</h4>
836 *
837
838 *
839 * Compared to the corresponding function in @ref step_6 "step-6", the one here is
840 * a tad more complicated. There are two reasons: the first one is
841 * that we do not just want to output the solution but also for each
842 * cell which processor owns it (i.e. which "subdomain" it is
843 * in). Secondly, as discussed at length in @ref step_17 "step-17" and @ref step_18 "step-18",
844 * generating graphical data can be a bottleneck in
845 * parallelizing. In those two programs, we simply generate one
846 * output file per process. That worked because the
847 * parallel::shared::Triangulation cannot be used with large numbers
848 * of MPI processes anyway. But this doesn't scale: Creating a
850 * large number of processors.
851 *
852
853 *
855 * high-performance, parallel IO routines using MPI I/O to write to
856 * a small, fixed number of visualization files (here 8). We also
857 * generate a .pvtu record referencing these .vtu files, which can
859 *
860
861 *
862 * To start, the top of the function looks like it usually does. In addition
863 * to attaching the solution vector (the one that has entries for all locally
864 * relevant, not only the locally owned, elements), we attach a data vector
865 * that stores, for each cell, the subdomain the cell belongs to. This is
867 * cell. The vector we attach therefore has an entry for every cell that the
868 * current processor has in its mesh (locally owned ones, ghost cells, and
869 * artificial cells), but the DataOut class will ignore all entries that
871 * consequence, it doesn't actually matter what values we write into these
872 * vector entries: we simply fill the entire vector with the number of the
873 * current MPI process (i.e. the subdomain_id of the current process); this
874 * correctly sets the values we care for, i.e. the entries that correspond
875 * to locally owned cells, while providing the wrong value for all other
876 * elements -- but these are then ignored anyway.
877 *
878 * @code
879 *   template <int dim>
880 *   void LaplaceProblem<dim>::output_results(const unsigned int cycle) const
881 *   {
882 *   DataOut<dim> data_out;
883 *   data_out.attach_dof_handler(dof_handler);
884 *   data_out.add_data_vector(locally_relevant_solution, "u");
885 *  
886 *   Vector<float> subdomain(triangulation.n_active_cells());
887 *   for (unsigned int i = 0; i < subdomain.size(); ++i)
888 *   subdomain(i) = triangulation.locally_owned_subdomain();
889 *   data_out.add_data_vector(subdomain, "subdomain");
890 *  
891 *   data_out.build_patches();
892 *  
893 * @endcode
894 *
895 * The final step is to write this data to disk. We write up to 8 VTU files
896 * in parallel with the help of MPI-IO. Additionally a PVTU record is
897 * generated, which groups the written VTU files.
898 *
899 * @code
900 *   data_out.write_vtu_with_pvtu_record(
901 *   "./", "solution", cycle, mpi_communicator, 2, 8);
902 *   }
903 *  
904 *  
905 *  
906 * @endcode
907 *
908 *
909 * <a name="LaplaceProblemrun"></a>
910 * <h4>LaplaceProblem::run</h4>
911 *
912
913 *
914 * The function that controls the overall behavior of the program is again
915 * like the one in @ref step_6 "step-6". The minor difference are the use of
916 * <code>pcout</code> instead of <code>std::cout</code> for output to the
917 * console (see also @ref step_17 "step-17").
918 *
919
920 *
921 * A functional difference to @ref step_6 "step-6" is the use of a square domain and that
922 * we start with a slightly finer mesh (5 global refinement cycles) -- there
925 * starting on 1024).
926 *
927 * @code
928 *   template <int dim>
930 *   {
931 *   pcout << "Running with "
933 *   << "PETSc"
934 *   #else
935 *   << "Trilinos"
936 *   #endif
937 *   << " on " << Utilities::MPI::n_mpi_processes(mpi_communicator)
938 *   << " MPI rank(s)..." << std::endl;
939 *  
940 *   const unsigned int n_cycles = 8;
941 *   for (unsigned int cycle = 0; cycle < n_cycles; ++cycle)
942 *   {
943 *   pcout << "Cycle " << cycle << ':' << std::endl;
944 *  
945 *   if (cycle == 0)
946 *   {
948 *   triangulation.refine_global(5);
949 *   }
950 *   else
951 *   refine_grid();
952 *  
953 *   setup_system();
954 *  
955 *   pcout << " Number of active cells: "
956 *   << triangulation.n_global_active_cells() << std::endl
957 *   << " Number of degrees of freedom: " << dof_handler.n_dofs()
958 *   << std::endl;
959 *  
961 *   solve();
962 *  
963 *   {
965 *   output_results(cycle);
966 *   }
967 *  
968 *   computing_timer.print_summary();
969 *   computing_timer.reset();
970 *  
971 *   pcout << std::endl;
972 *   }
973 *   }
974 *   } // namespace Step40
975 *  
976 *  
977 *  
978 * @endcode
979 *
980 *
981 * <a name="main"></a>
982 * <h4>main()</h4>
983 *
984
985 *
986 * The final function, <code>main()</code>, again has the same structure as in
987 * all other programs, in particular @ref step_6 "step-6". Like the other programs that use
988 * MPI, we have to initialize and finalize MPI, which is done using the helper
991 * Zoltan (though the last two are not used in this tutorial). The order here
993 * initialized, so it does not make sense to do anything before creating an
995 *
996
997 *
1002 * libraries), which will delete any in-use PETSc objects. This must be done
1003 * after we destruct the Laplace solver to avoid double deletion
1004 * errors. Fortunately, due to the order of destructor call rules of C++, we
1006 * order (i.e., the reverse of the order of construction). The last function
1007 * called by Utilities::MPI::MPI_InitFinalize::~MPI_InitFinalize() is
1010 *
1011 * @code
1012 *   int main(int argc, char *argv[])
1013 *   {
1014 *   try
1015 *   {
1016 *   using namespace dealii;
1017 *   using namespace Step40;
1018 *  
1020 *  
1022 *   laplace_problem_2d.run();
1023 *   }
1024 *   catch (std::exception &exc)
1025 *   {
1026 *   std::cerr << std::endl
1027 *   << std::endl
1028 *   << "----------------------------------------------------"
1029 *   << std::endl;
1030 *   std::cerr << "Exception on processing: " << std::endl
1031 *   << exc.what() << std::endl
1032 *   << "Aborting!" << std::endl
1033 *   << "----------------------------------------------------"
1034 *   << std::endl;
1035 *  
1036 *   return 1;
1037 *   }
1038 *   catch (...)
1039 *   {
1040 *   std::cerr << std::endl
1041 *   << std::endl
1042 *   << "----------------------------------------------------"
1043 *   << std::endl;
1044 *   std::cerr << "Unknown exception!" << std::endl
1045 *   << "Aborting!" << std::endl
1046 *   << "----------------------------------------------------"
1047 *   << std::endl;
1048 *   return 1;
1049 *   }
1050 *  
1051 *   return 0;
1052 *   }
1053 * @endcode
1054<a name="Results"></a><h1>Results</h1>
1055
1056
1058installation on a few, you should get output like this:
1059@code
1060Cycle 0:
1061 Number of active cells: 1024
1062 Number of degrees of freedom: 4225
1063 Solved in 10 iterations.
1064
1065
1066+---------------------------------------------+------------+------------+
1067| Total wallclock time elapsed since start | 0.176s | |
1068| | | |
1069| Section | no. calls | wall time | % of total |
1070+---------------------------------+-----------+------------+------------+
1071| assembly | 1 | 0.0209s | 12% |
1072| output | 1 | 0.0189s | 11% |
1073| setup | 1 | 0.0299s | 17% |
1074| solve | 1 | 0.0419s | 24% |
1075+---------------------------------+-----------+------------+------------+
1076
1077
1078Cycle 1:
1079 Number of active cells: 1954
1080 Number of degrees of freedom: 8399
1081 Solved in 10 iterations.
1082
1083
1084+---------------------------------------------+------------+------------+
1085| Total wallclock time elapsed since start | 0.327s | |
1086| | | |
1087| Section | no. calls | wall time | % of total |
1088+---------------------------------+-----------+------------+------------+
1089| assembly | 1 | 0.0368s | 11% |
1090| output | 1 | 0.0208s | 6.4% |
1091| refine | 1 | 0.157s | 48% |
1092| setup | 1 | 0.0452s | 14% |
1093| solve | 1 | 0.0668s | 20% |
1094+---------------------------------+-----------+------------+------------+
1095
1096
1097Cycle 2:
1098 Number of active cells: 3664
1099 Number of degrees of freedom: 16183
1100 Solved in 11 iterations.
1101
1102...
1103@endcode
1104
1106this is due to the fact that the preconditioner depends on the
1107partitioning of the problem, the solution then differs in the last few
1112
1113When run on a sufficiently large number of machines (say a few
1115over one billion unknowns in less than a minute. On the other hand,
1119
1120<table width="100%">
1121<tr>
1122<td>
1123 <img src="https://www.dealii.org/images/steps/developer/step-40.mesh.png" alt="">
1124</td>
1125<td>
1126 <img src="https://www.dealii.org/images/steps/developer/step-40.solution.png" alt="">
1127</td>
1128</tr>
1129</table>
1130
1131The mesh on the left has a mere 7,069 cells. This is of course a
1133processor using @ref step_6 "step-6", but the point of the program was to show how
1135here are two graphs that show how the run time of a large number of parts
1139@ref distributed_paper "Distributed Computing paper"; updated graphs showing
1141more interpretation can be found in the final version of the paper):
1142
1143<table width="100%">
1144<tr>
1145<td>
1146 <img src="https://www.dealii.org/images/steps/developer/step-40.strong2.png" alt="">
1147</td>
1148<td>
1149 <img src="https://www.dealii.org/images/steps/developer/step-40.strong.png" alt="">
1150</td>
1151</tr>
1152</table>
1153
1156(For a discussion of what we consider "scalable" programs, see
1157@ref GlossParallelScaling "this glossary entry".)
1158The curves, in particular the linear solver, become a
1161processor has to solve in the above two examples is only 13,000 and 90,000
1162degrees of freedom when 4,096 processors are used; a good rule of thumb is that
1163parallel programs work well if each processor has at least 100,000 unknowns).
1164
1165While the strong scaling graphs above show that we can solve a problem of
1168be solved within a reasonable time on a machine of a particular size. We show
1169this in the following two graphs for 256 and 4096 processors:
1170
1171<table width="100%">
1172<tr>
1173<td>
1174 <img src="https://www.dealii.org/images/steps/developer/step-40.256.png" alt="">
1175</td>
1176<td>
1177 <img src="https://www.dealii.org/images/steps/developer/step-40.4096.png" alt="">
1178</td>
1179</tr>
1180</table>
1181
1183the number of degrees of freedom. This time, lines are wobbly at the left as
1184the size of local problems is too small. For more discussions of these results
1185we refer to the @ref distributed_paper "Distributed Computing paper".
1186
1187So how large are the largest problems one can solve? At the time of writing
1190multigrid method from the <a
1191href="http://acts.nersc.gov/hypre/" target="_top">Hypre package</a> as
1192a preconditioner, which unfortunately uses signed 32-bit integers to
1193index the elements of a %distributed matrix. This limits the size of
1194problems to @f$2^{31}-1=2,147,483,647@f$ degrees of freedom. From the graphs
1196number, and one could expect that given more than the 4,096 machines
1197shown above would also further reduce the compute time. That said, one
1200
1201On the other hand, this does not mean that deal.II cannot solve bigger
1202problems. Indeed, @ref step_37 "step-37" shows how one can solve problems that are not
1204here.
1205
1206
1207
1208<a name="extensions"></a>
1209<a name="Possibilitiesforextensions"></a><h3>Possibilities for extensions</h3>
1210
1211
1212In a sense, this program is the ultimate solver for the Laplace
1222this for @ref step_31 "step-31" in the @ref step_32 "step-32" tutorial program, but the same would
1223apply to, for example, @ref step_23 "step-23" and @ref step_25 "step-25" for hyperbolic time
1224dependent problems, @ref step_33 "step-33" for gas dynamics, or @ref step_35 "step-35" for the
1226
1228mentioned above, we only show pictures of the solution and the mesh
1230would produce graphical output on the order of several 10
1236solver on this processor. Implementing such an interface would allow
1239 *
1240 *
1241<a name="PlainProg"></a>
1242<h1> The plain program</h1>
1243@include "step-40.cc"
1244*/
value_type * data() const noexcept
Definition array_view.h:553
unsigned int level
Definition grid_out.cc:4618
void apply(const Kokkos::TeamPolicy< MemorySpace::Default::kokkos_space::execution_space >::member_type &team_member, const Kokkos::View< Number *, MemorySpace::Default::kokkos_space > shape_data, const ViewTypeIn in, ViewTypeOut out)
void hyper_cube(Triangulation< dim, spacedim > &tria, const double left=0., const double right=1., const bool colorize=false)
void refine(Triangulation< dim, spacedim > &tria, const Vector< Number > &criteria, const double threshold, const unsigned int max_to_mark=numbers::invalid_unsigned_int)
void coarsen(Triangulation< dim, spacedim > &tria, const Vector< Number > &criteria, const double threshold)
void scale(const double scaling_factor, Triangulation< dim, spacedim > &triangulation)
@ matrix
Contents is actually a matrix.
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
Definition utilities.cc:189
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
VectorType::value_type * end(VectorType &V)
unsigned int n_mpi_processes(const MPI_Comm mpi_communicator)
Definition mpi.cc:150
T reduce(const T &local_value, const MPI_Comm comm, const std::function< T(const T &, const T &)> &combiner, const unsigned int root_process=0)
void run(const Iterator &begin, const std_cxx20::type_identity_t< Iterator > &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
int(&) functions(const void *v1, const void *v2)
const ::parallel::distributed::Triangulation< dim, spacedim > * triangulation