Reference documentation for deal.II version 9.4.0
\(\newcommand{\dealvcentcolon}{\mathrel{\mathop{:}}}\) \(\newcommand{\dealcoloneq}{\dealvcentcolon\mathrel{\mkern-1.2mu}=}\) \(\newcommand{\jump}[1]{\left[\!\left[ #1 \right]\!\right]}\) \(\newcommand{\average}[1]{\left\{\!\left\{ #1 \right\}\!\right\}}\)
step-48.h
Go to the documentation of this file.
1 ) const override
550  * {
551  * double t = this->get_time();
552  *
553  * const double m = 0.5;
554  * const double c1 = 0.;
555  * const double c2 = 0.;
556  * const double factor =
557  * (m / std::sqrt(1. - m * m) * std::sin(std::sqrt(1. - m * m) * t + c2));
558  * double result = 1.;
559  * for (unsigned int d = 0; d < dim; ++d)
560  * result *= -4. * std::atan(factor / std::cosh(m * p[d] + c1));
561  * return result;
562  * }
563  * };
564  *
565  *
566  *
567  * @endcode
568  *
569  *
570  * <a name="SineGordonProblemclass"></a>
571  * <h3>SineGordonProblem class</h3>
572  *
573 
574  *
575  * This is the main class that builds on the class in @ref step_25 "step-25". However, we
576  * replaced the SparseMatrix<double> class by the MatrixFree class to store
577  * the geometry data. Also, we use a distributed triangulation in this
578  * example.
579  *
580  * @code
581  * template <int dim>
582  * class SineGordonProblem
583  * {
584  * public:
585  * SineGordonProblem();
586  * void run();
587  *
588  * private:
589  * ConditionalOStream pcout;
590  *
591  * void make_grid_and_dofs();
592  * void output_results(const unsigned int timestep_number);
593  *
594  * #ifdef DEAL_II_WITH_P4EST
596  * #else
598  * #endif
599  * FE_Q<dim> fe;
600  * DoFHandler<dim> dof_handler;
601  *
602  * MappingQ1<dim> mapping;
603  *
604  * AffineConstraints<double> constraints;
605  * IndexSet locally_relevant_dofs;
606  *
607  * MatrixFree<dim, double> matrix_free_data;
608  *
609  * LinearAlgebra::distributed::Vector<double> solution, old_solution,
610  * old_old_solution;
611  *
612  * const unsigned int n_global_refinements;
613  * double time, time_step;
614  * const double final_time;
615  * const double cfl_number;
616  * const unsigned int output_timestep_skip;
617  * };
618  *
619  *
620  * @endcode
621  *
622  *
623  * <a name="SineGordonProblemSineGordonProblem"></a>
624  * <h4>SineGordonProblem::SineGordonProblem</h4>
625  *
626 
627  *
628  * This is the constructor of the SineGordonProblem class. The time interval
629  * and time step size are defined here. Moreover, we use the degree of the
630  * finite element that we defined at the top of the program to initialize a
631  * FE_Q finite element based on Gauss-Lobatto support points. These points
632  * are convenient because in conjunction with a QGaussLobatto quadrature
633  * rule of the same order they give a diagonal mass matrix without
634  * compromising accuracy too much (note that the integration is inexact,
635  * though), see also the discussion in the introduction. Note that FE_Q
636  * selects the Gauss-Lobatto nodal points by default due to their improved
637  * conditioning versus equidistant points. To make things more explicit, we
638  * state the selection of the nodal points nonetheless.
639  *
640  * @code
641  * template <int dim>
642  * SineGordonProblem<dim>::SineGordonProblem()
643  * : pcout(std::cout, Utilities::MPI::this_mpi_process(MPI_COMM_WORLD) == 0)
644  * #ifdef DEAL_II_WITH_P4EST
645  * , triangulation(MPI_COMM_WORLD)
646  * #endif
647  * , fe(QGaussLobatto<1>(fe_degree + 1))
648  * , dof_handler(triangulation)
649  * , n_global_refinements(10 - 2 * dim)
650  * , time(-10)
651  * , time_step(10.)
652  * , final_time(10.)
653  * , cfl_number(.1 / fe_degree)
654  * , output_timestep_skip(200)
655  * {}
656  *
657  * @endcode
658  *
659  *
660  * <a name="SineGordonProblemmake_grid_and_dofs"></a>
661  * <h4>SineGordonProblem::make_grid_and_dofs</h4>
662  *
663 
664  *
665  * As in @ref step_25 "step-25" this functions sets up a cube grid in <code>dim</code>
666  * dimensions of extent @f$[-15,15]@f$. We refine the mesh more in the center of
667  * the domain since the solution is concentrated there. We first refine all
668  * cells whose center is within a radius of 11, and then refine once more
669  * for a radius 6. This simple ad hoc refinement could be done better by
670  * adapting the mesh to the solution using error estimators during the time
671  * stepping as done in other example programs, and using
672  * parallel::distributed::SolutionTransfer to transfer the solution to the
673  * new mesh.
674  *
675  * @code
676  * template <int dim>
677  * void SineGordonProblem<dim>::make_grid_and_dofs()
678  * {
680  * triangulation.refine_global(n_global_refinements);
681  * {
683  * cell = triangulation.begin_active(),
684  * end_cell = triangulation.end();
685  * for (; cell != end_cell; ++cell)
686  * if (cell->is_locally_owned())
687  * if (cell->center().norm() < 11)
688  * cell->set_refine_flag();
689  * triangulation.execute_coarsening_and_refinement();
690  *
691  * cell = triangulation.begin_active();
692  * end_cell = triangulation.end();
693  * for (; cell != end_cell; ++cell)
694  * if (cell->is_locally_owned())
695  * if (cell->center().norm() < 6)
696  * cell->set_refine_flag();
697  * triangulation.execute_coarsening_and_refinement();
698  * }
699  *
700  * pcout << " Number of global active cells: "
701  * #ifdef DEAL_II_WITH_P4EST
702  * << triangulation.n_global_active_cells()
703  * #else
704  * << triangulation.n_active_cells()
705  * #endif
706  * << std::endl;
707  *
708  * dof_handler.distribute_dofs(fe);
709  *
710  * pcout << " Number of degrees of freedom: " << dof_handler.n_dofs()
711  * << std::endl;
712  *
713  *
714  * @endcode
715  *
716  * We generate hanging node constraints for ensuring continuity of the
717  * solution. As in @ref step_40 "step-40", we need to equip the constraint matrix with
718  * the IndexSet of locally relevant degrees of freedom to avoid it to
719  * consume too much memory for big problems. Next, the <code> MatrixFree
720  * </code> object for the problem is set up. Note that we specify a
721  * particular scheme for shared-memory parallelization (hence one would
722  * use multithreading for intra-node parallelism and not MPI; we here
723  * choose the standard option &mdash; if we wanted to disable shared
724  * memory parallelization even in case where there is more than one TBB
725  * thread available in the program, we would choose
727  * instead of using the default QGauss quadrature argument, we supply a
728  * QGaussLobatto quadrature formula to enable the desired
729  * behavior. Finally, three solution vectors are initialized. MatrixFree
730  * expects a particular layout of ghost indices (as it handles index
731  * access in MPI-local numbers that need to match between the vector and
732  * MatrixFree), so we just ask it to initialize the vectors to be sure the
733  * ghost exchange is properly handled.
734  *
735  * @code
736  * locally_relevant_dofs =
738  * constraints.clear();
739  * constraints.reinit(locally_relevant_dofs);
740  * DoFTools::make_hanging_node_constraints(dof_handler, constraints);
741  * constraints.close();
742  *
743  * typename MatrixFree<dim>::AdditionalData additional_data;
744  * additional_data.tasks_parallel_scheme =
746  *
747  * matrix_free_data.reinit(mapping,
748  * dof_handler,
749  * constraints,
750  * QGaussLobatto<1>(fe_degree + 1),
751  * additional_data);
752  *
753  * matrix_free_data.initialize_dof_vector(solution);
754  * old_solution.reinit(solution);
755  * old_old_solution.reinit(solution);
756  * }
757  *
758  *
759  *
760  * @endcode
761  *
762  *
763  * <a name="SineGordonProblemoutput_results"></a>
764  * <h4>SineGordonProblem::output_results</h4>
765  *
766 
767  *
768  * This function prints the norm of the solution and writes the solution
769  * vector to a file. The norm is standard (except for the fact that we need
770  * to accumulate the norms over all processors for the parallel grid which
771  * we do via the VectorTools::compute_global_error() function), and the
772  * second is similar to what we did in @ref step_40 "step-40" or @ref step_37 "step-37". Note that we can
773  * use the same vector for output as the one used during computations: The
774  * vectors in the matrix-free framework always provide full information on
775  * all locally owned cells (this is what is needed in the local evaluations,
776  * too), including ghost vector entries on these cells. This is the only
777  * data that is needed in the VectorTools::integrate_difference() function
778  * as well as in DataOut. The only action to take at this point is to make
779  * sure that the vector updates its ghost values before we read from
780  * them, and to reset ghost values once done. This is a feature present only
781  * in the LinearAlgebra::distributed::Vector class. Distributed vectors with
782  * PETSc and Trilinos, on the other hand, need to be copied to special
783  * vectors including ghost values (see the relevant section in @ref step_40 "step-40"). If
784  * we also wanted to access all degrees of freedom on ghost cells (e.g. when
785  * computing error estimators that use the jump of solution over cell
786  * boundaries), we would need more information and create a vector
787  * initialized with locally relevant dofs just as in @ref step_40 "step-40". Observe also
788  * that we need to distribute constraints for output - they are not filled
789  * during computations (rather, they are interpolated on the fly in the
790  * matrix-free method FEEvaluation::read_dof_values()).
791  *
792  * @code
793  * template <int dim>
794  * void
795  * SineGordonProblem<dim>::output_results(const unsigned int timestep_number)
796  * {
797  * constraints.distribute(solution);
798  *
799  * Vector<float> norm_per_cell(triangulation.n_active_cells());
800  * solution.update_ghost_values();
802  * dof_handler,
803  * solution,
805  * norm_per_cell,
806  * QGauss<dim>(fe_degree + 1),
808  * const double solution_norm =
810  * norm_per_cell,
812  *
813  * pcout << " Time:" << std::setw(8) << std::setprecision(3) << time
814  * << ", solution norm: " << std::setprecision(5) << std::setw(7)
815  * << solution_norm << std::endl;
816  *
817  * DataOut<dim> data_out;
818  *
819  * data_out.attach_dof_handler(dof_handler);
820  * data_out.add_data_vector(solution, "solution");
821  * data_out.build_patches(mapping);
822  *
823  * data_out.write_vtu_with_pvtu_record(
824  * "./", "solution", timestep_number, MPI_COMM_WORLD, 3);
825  *
826  * solution.zero_out_ghost_values();
827  * }
828  *
829  *
830  * @endcode
831  *
832  *
833  * <a name="SineGordonProblemrun"></a>
834  * <h4>SineGordonProblem::run</h4>
835  *
836 
837  *
838  * This function is called by the main function and steps into the
839  * subroutines of the class.
840  *
841 
842  *
843  * After printing some information about the parallel setup, the first
844  * action is to set up the grid and the cell operator. Then, the time step
845  * is computed from the CFL number given in the constructor and the finest
846  * mesh size. The finest mesh size is computed as the diameter of the last
847  * cell in the triangulation, which is the last cell on the finest level of
848  * the mesh. This is only possible for meshes where all elements on a level
849  * have the same size, otherwise, one needs to loop over all cells. Note
850  * that we need to query all the processors for their finest cell since
851  * not all processors might hold a region where the mesh is at the finest
852  * level. Then, we readjust the time step a little to hit the final time
853  * exactly.
854  *
855  * @code
856  * template <int dim>
858  * {
859  * {
860  * pcout << "Number of MPI ranks: "
861  * << Utilities::MPI::n_mpi_processes(MPI_COMM_WORLD) << std::endl;
862  * pcout << "Number of threads on each rank: "
863  * << MultithreadInfo::n_threads() << std::endl;
864  * const unsigned int n_vect_doubles = VectorizedArray<double>::size();
865  * const unsigned int n_vect_bits = 8 * sizeof(double) * n_vect_doubles;
866  * pcout << "Vectorization over " << n_vect_doubles
867  * << " doubles = " << n_vect_bits << " bits ("
869  * << std::endl
870  * << std::endl;
871  * }
872  * make_grid_and_dofs();
873  *
874  * const double local_min_cell_diameter =
875  * triangulation.last()->diameter() / std::sqrt(dim);
876  * const double global_min_cell_diameter =
877  * -Utilities::MPI::max(-local_min_cell_diameter, MPI_COMM_WORLD);
878  * time_step = cfl_number * global_min_cell_diameter;
879  * time_step = (final_time - time) / (int((final_time - time) / time_step));
880  * pcout << " Time step size: " << time_step
881  * << ", finest cell: " << global_min_cell_diameter << std::endl
882  * << std::endl;
883  *
884  * @endcode
885  *
886  * Next the initial value is set. Since we have a two-step time stepping
887  * method, we also need a value of the solution at time-time_step. For
888  * accurate results, one would need to compute this from the time
889  * derivative of the solution at initial time, but here we ignore this
890  * difficulty and just set it to the initial value function at that
891  * artificial time.
892  *
893 
894  *
895  * We then go on by writing the initial state to file and collecting
896  * the two starting solutions in a <tt>std::vector</tt> of pointers that
897  * get later consumed by the SineGordonOperation::apply() function. Next,
898  * an instance of the <code> SineGordonOperation class </code> based on
899  * the finite element degree specified at the top of this file is set up.
900  *
901  * @code
902  * VectorTools::interpolate(mapping,
903  * dof_handler,
904  * InitialCondition<dim>(1, time),
905  * solution);
906  * VectorTools::interpolate(mapping,
907  * dof_handler,
908  * InitialCondition<dim>(1, time - time_step),
909  * old_solution);
910  * output_results(0);
911  *
912  * std::vector<LinearAlgebra::distributed::Vector<double> *>
913  * previous_solutions({&old_solution, &old_old_solution});
914  *
915  * SineGordonOperation<dim, fe_degree> sine_gordon_op(matrix_free_data,
916  * time_step);
917  *
918  * @endcode
919  *
920  * Now loop over the time steps. In each iteration, we shift the solution
921  * vectors by one and call the `apply` function of the
922  * `SineGordonOperator` class. Then, we write the solution to a file. We
923  * clock the wall times for the computational time needed as wall as the
924  * time needed to create the output and report the numbers when the time
925  * stepping is finished.
926  *
927 
928  *
929  * Note how this shift is implemented: We simply call the swap method on
930  * the two vectors which swaps only some pointers without the need to copy
931  * data around, a relatively expensive operation within an explicit time
932  * stepping method. Let us see what happens in more detail: First, we
933  * exchange <code>old_solution</code> with <code>old_old_solution</code>,
934  * which means that <code>old_old_solution</code> gets
935  * <code>old_solution</code>, which is what we expect. Similarly,
936  * <code>old_solution</code> gets the content from <code>solution</code>
937  * in the next step. After this, <code>solution</code> holds
938  * <code>old_old_solution</code>, but that will be overwritten during this
939  * step.
940  *
941  * @code
942  * unsigned int timestep_number = 1;
943  *
944  * Timer timer;
945  * double wtime = 0;
946  * double output_time = 0;
947  * for (time += time_step; time <= final_time;
948  * time += time_step, ++timestep_number)
949  * {
950  * timer.restart();
951  * old_old_solution.swap(old_solution);
952  * old_solution.swap(solution);
953  * sine_gordon_op.apply(solution, previous_solutions);
954  * wtime += timer.wall_time();
955  *
956  * timer.restart();
957  * if (timestep_number % output_timestep_skip == 0)
958  * output_results(timestep_number / output_timestep_skip);
959  *
960  * output_time += timer.wall_time();
961  * }
962  * timer.restart();
963  * output_results(timestep_number / output_timestep_skip + 1);
964  * output_time += timer.wall_time();
965  *
966  * pcout << std::endl
967  * << " Performed " << timestep_number << " time steps." << std::endl;
968  *
969  * pcout << " Average wallclock time per time step: "
970  * << wtime / timestep_number << 's' << std::endl;
971  *
972  * pcout << " Spent " << output_time << "s on output and " << wtime
973  * << "s on computations." << std::endl;
974  * }
975  * } // namespace Step48
976  *
977  *
978  *
979  * @endcode
980  *
981  *
982  * <a name="Thecodemaincodefunction"></a>
983  * <h3>The <code>main</code> function</h3>
984  *
985 
986  *
987  * As in @ref step_40 "step-40", we initialize MPI at the start of the program. Since we will
988  * in general mix MPI parallelization with threads, we also set the third
989  * argument in MPI_InitFinalize that controls the number of threads to an
990  * invalid number, which means that the TBB library chooses the number of
991  * threads automatically, typically to the number of available cores in the
992  * system. As an alternative, you can also set this number manually if you
993  * want to set a specific number of threads (e.g. when MPI-only is required).
994  *
995  * @code
996  * int main(int argc, char **argv)
997  * {
998  * using namespace Step48;
999  * using namespace dealii;
1000  *
1001  * Utilities::MPI::MPI_InitFinalize mpi_initialization(
1002  * argc, argv, numbers::invalid_unsigned_int);
1003  *
1004  * try
1005  * {
1006  * SineGordonProblem<dimension> sg_problem;
1007  * sg_problem.run();
1008  * }
1009  * catch (std::exception &exc)
1010  * {
1011  * std::cerr << std::endl
1012  * << std::endl
1013  * << "----------------------------------------------------"
1014  * << std::endl;
1015  * std::cerr << "Exception on processing: " << std::endl
1016  * << exc.what() << std::endl
1017  * << "Aborting!" << std::endl
1018  * << "----------------------------------------------------"
1019  * << std::endl;
1020  *
1021  * return 1;
1022  * }
1023  * catch (...)
1024  * {
1025  * std::cerr << std::endl
1026  * << std::endl
1027  * << "----------------------------------------------------"
1028  * << std::endl;
1029  * std::cerr << "Unknown exception!" << std::endl
1030  * << "Aborting!" << std::endl
1031  * << "----------------------------------------------------"
1032  * << std::endl;
1033  * return 1;
1034  * }
1035  *
1036  * return 0;
1037  * }
1038  * @endcode
1039 <a name="Results"></a><h1>Results</h1>
1040 
1041 
1042 <a name="Comparisonwithasparsematrix"></a><h3>Comparison with a sparse matrix</h3>
1043 
1044 
1045 In order to demonstrate the gain in using the MatrixFree class instead of
1046 the standard <code>deal.II</code> assembly routines for evaluating the
1047 information from old time steps, we study a simple serial run of the code on a
1048 nonadaptive mesh. Since much time is spent on evaluating the sine function, we
1049 do not only show the numbers of the full sine-Gordon equation but also for the
1050 wave equation (the sine-term skipped from the sine-Gordon equation). We use
1051 both second and fourth order elements. The results are summarized in the
1052 following table.
1053 
1054 <table align="center" class="doxtable">
1055  <tr>
1056  <th>&nbsp;</th>
1057  <th colspan="3">wave equation</th>
1058  <th colspan="2">sine-Gordon</th>
1059  </tr>
1060  <tr>
1061  <th>&nbsp;</th>
1062  <th>MF</th>
1063  <th>SpMV</th>
1064  <th>dealii</th>
1065  <th>MF</th>
1066  <th>dealii</th>
1067  </tr>
1068  <tr>
1069  <td>2D, @f$\mathcal{Q}_2@f$</td>
1070  <td align="right"> 0.0106</td>
1071  <td align="right"> 0.00971</td>
1072  <td align="right"> 0.109</td>
1073  <td align="right"> 0.0243</td>
1074  <td align="right"> 0.124</td>
1075  </tr>
1076  <tr>
1077  <td>2D, @f$\mathcal{Q}_4@f$</td>
1078  <td align="right"> 0.0328</td>
1079  <td align="right"> 0.0706</td>
1080  <td align="right"> 0.528</td>
1081  <td align="right"> 0.0714</td>
1082  <td align="right"> 0.502</td>
1083  </tr>
1084  <tr>
1085  <td>3D, @f$\mathcal{Q}_2@f$</td>
1086  <td align="right"> 0.0151</td>
1087  <td align="right"> 0.0320</td>
1088  <td align="right"> 0.331</td>
1089  <td align="right"> 0.0376</td>
1090  <td align="right"> 0.364</td>
1091  </tr>
1092  <tr>
1093  <td>3D, @f$\mathcal{Q}_4@f$</td>
1094  <td align="right"> 0.0918</td>
1095  <td align="right"> 0.844</td>
1096  <td align="right"> 6.83</td>
1097  <td align="right"> 0.194</td>
1098  <td align="right"> 6.95</td>
1099  </tr>
1100 </table>
1101 
1102 It is apparent that the matrix-free code outperforms the standard assembly
1103 routines in deal.II by far. In 3D and for fourth order elements, one operator
1104 evaluation is also almost ten times as fast as a sparse matrix-vector
1105 product.
1106 
1107 <a name="Parallelrunin2Dand3D"></a><h3>Parallel run in 2D and 3D</h3>
1108 
1109 
1110 We start with the program output obtained on a workstation with 12 cores / 24
1111 threads (one Intel Xeon E5-2687W v4 CPU running at 3.2 GHz, hyperthreading
1112 enabled), running the program in release mode:
1113 @code
1114 \$ make run
1115 Number of MPI ranks: 1
1116 Number of threads on each rank: 24
1117 Vectorization over 4 doubles = 256 bits (AVX)
1118 
1119  Number of global active cells: 15412
1120  Number of degrees of freedom: 249065
1121  Time step size: 0.00292997, finest cell: 0.117188
1122 
1123  Time: -10, solution norm: 9.5599
1124  Time: -9.41, solution norm: 17.678
1125  Time: -8.83, solution norm: 23.504
1126  Time: -8.24, solution norm: 27.5
1127  Time: -7.66, solution norm: 29.513
1128  Time: -7.07, solution norm: 29.364
1129  Time: -6.48, solution norm: 27.23
1130  Time: -5.9, solution norm: 23.527
1131  Time: -5.31, solution norm: 18.439
1132  Time: -4.73, solution norm: 11.935
1133  Time: -4.14, solution norm: 5.5284
1134  Time: -3.55, solution norm: 8.0354
1135  Time: -2.97, solution norm: 14.707
1136  Time: -2.38, solution norm: 20
1137  Time: -1.8, solution norm: 22.834
1138  Time: -1.21, solution norm: 22.771
1139  Time: -0.624, solution norm: 20.488
1140  Time: -0.0381, solution norm: 16.697
1141  Time: 0.548, solution norm: 11.221
1142  Time: 1.13, solution norm: 5.3912
1143  Time: 1.72, solution norm: 8.4528
1144  Time: 2.31, solution norm: 14.335
1145  Time: 2.89, solution norm: 18.555
1146  Time: 3.48, solution norm: 20.894
1147  Time: 4.06, solution norm: 21.305
1148  Time: 4.65, solution norm: 19.903
1149  Time: 5.24, solution norm: 16.864
1150  Time: 5.82, solution norm: 12.223
1151  Time: 6.41, solution norm: 6.758
1152  Time: 6.99, solution norm: 7.2423
1153  Time: 7.58, solution norm: 12.888
1154  Time: 8.17, solution norm: 17.273
1155  Time: 8.75, solution norm: 19.654
1156  Time: 9.34, solution norm: 19.838
1157  Time: 9.92, solution norm: 17.964
1158  Time: 10, solution norm: 17.595
1159 
1160  Performed 6826 time steps.
1161  Average wallclock time per time step: 0.0013453s
1162  Spent 14.976s on output and 9.1831s on computations.
1163 @endcode
1164 
1165 In 3D, the respective output looks like
1166 @code
1167 \$ make run
1168 Number of MPI ranks: 1
1169 Number of threads on each rank: 24
1170 Vectorization over 4 doubles = 256 bits (AVX)
1171 
1172  Number of global active cells: 17592
1173  Number of degrees of freedom: 1193881
1174  Time step size: 0.0117233, finest cell: 0.46875
1175 
1176  Time: -10, solution norm: 29.558
1177  Time: -7.66, solution norm: 129.13
1178  Time: -5.31, solution norm: 67.753
1179  Time: -2.97, solution norm: 79.245
1180  Time: -0.621, solution norm: 123.52
1181  Time: 1.72, solution norm: 43.525
1182  Time: 4.07, solution norm: 93.285
1183  Time: 6.41, solution norm: 97.722
1184  Time: 8.76, solution norm: 36.734
1185  Time: 10, solution norm: 94.115
1186 
1187  Performed 1706 time steps.
1188  Average wallclock time per time step: 0.0084542s
1189  Spent 16.766s on output and 14.423s on computations.
1190 @endcode
1191 
1192 It takes 0.008 seconds for one time step with more than a million
1193 degrees of freedom (note that we would need many processors to reach such
1194 numbers when solving linear systems).
1195 
1196 If we replace the thread-parallelization by a pure MPI parallelization, the
1197 timings change into:
1198 @code
1199 \$ mpirun -n 24 ./step-48
1200 Number of MPI ranks: 24
1201 Number of threads on each rank: 1
1202 Vectorization over 4 doubles = 256 bits (AVX)
1203 ...
1204  Performed 1706 time steps.
1205  Average wallclock time per time step: 0.0051747s
1206  Spent 2.0535s on output and 8.828s on computations.
1207 @endcode
1208 
1209 We observe a dramatic speedup for the output (which makes sense, given that
1210 most code of the output is not parallelized via threads, whereas it is for
1211 MPI), but less than the theoretical factor of 12 we would expect from the
1212 parallelism. More interestingly, the computations also get faster when
1213 switching from the threads-only variant to the MPI-only variant. This is a
1214 general observation for the MatrixFree framework (as of updating this data in
1215 2019). The main reason is that the decisions regarding work on conflicting
1216 cell batches made to enable execution in parallel are overly pessimistic:
1217 While they ensure that no work on neighboring cells is done on different
1218 threads at the same time, this conservative setting implies that data from
1219 neighboring cells is also evicted from caches by the time neighbors get
1220 touched. Furthermore, the current scheme is not able to provide a constant
1221 load for all 24 threads for the given mesh with 17,592 cells.
1222 
1223 The current program allows to also mix MPI parallelization with thread
1224 parallelization. This is most beneficial when running programs on clusters
1225 with multiple nodes, using MPI for the inter-node parallelization and threads
1226 for the intra-node parallelization. On the workstation used above, we can run
1227 threads in the hyperthreading region (i.e., using 2 threads for each of the 12
1228 MPI ranks). An important setting for mixing MPI with threads is to ensure
1229 proper binning of tasks to CPUs. On many clusters the placing is either
1230 automatically via the `mpirun/mpiexec` environment, or there can be manual
1231 settings. Here, we simply report the run times the plain version of the
1232 program (noting that things could be improved towards the timings of the
1233 MPI-only program when proper pinning is done):
1234 @code
1235 \$ mpirun -n 12 ./step-48
1236 Number of MPI ranks: 12
1237 Number of threads on each rank: 2
1238 Vectorization over 4 doubles = 256 bits (AVX)
1239 ...
1240  Performed 1706 time steps.
1241  Average wallclock time per time step: 0.0056651s
1242  Spent 2.5175s on output and 9.6646s on computations.
1243 @endcode
1244 
1245 
1246 
1247 <a name="Possibilitiesforextensions"></a><h3>Possibilities for extensions</h3>
1248 
1249 
1250 There are several things in this program that could be improved to make it
1251 even more efficient (besides improved boundary conditions and physical
1252 stuff as discussed in @ref step_25 "step-25"):
1253 
1254 <ul> <li> <b>Faster evaluation of sine terms:</b> As becomes obvious
1255  from the comparison of the plain wave equation and the sine-Gordon
1256  equation above, the evaluation of the sine terms dominates the total
1257  time for the finite element operator application. There are a few
1258  reasons for this: Firstly, the deal.II sine computation of a
1259  VectorizedArray field is not vectorized (as opposed to the rest of
1260  the operator application). This could be cured by handing the sine
1261  computation to a library with vectorized sine computations like
1262  Intel's math kernel library (MKL). By using the function
1263  <code>vdSin</code> in MKL, the program uses half the computing time
1264  in 2D and 40 percent less time in 3D. On the other hand, the sine
1265  computation is structurally much more complicated than the simple
1266  arithmetic operations like additions and multiplications in the rest
1267  of the local operation.
1268 
1269  <li> <b>Higher order time stepping:</b> While the implementation allows for
1270  arbitrary order in the spatial part (by adjusting the degree of the finite
1271  element), the time stepping scheme is a standard second-order leap-frog
1272  scheme. Since solutions in wave propagation problems are usually very
1273  smooth, the error is likely dominated by the time stepping part. Of course,
1274  this could be cured by using smaller time steps (at a fixed spatial
1275  resolution), but it would be more efficient to use higher order time
1276  stepping as well. While it would be straight-forward to do so for a
1277  first-order system (use some Runge&ndash;Kutta scheme of higher order,
1278  probably combined with adaptive time step selection like the <a
1279  href="http://en.wikipedia.org/wiki/Dormand%E2%80%93Prince_method">Dormand&ndash;Prince
1280  method</a>), it is more challenging for the second-order formulation. At
1281  least in the finite difference community, people usually use the PDE to find
1282  spatial correction terms that improve the temporal error.
1283 
1284 </ul>
1285  *
1286  *
1287 <a name="PlainProg"></a>
1288 <h1> The plain program</h1>
1289 @include "step-48.cc"
1290 */
void swap(BlockIndices &u, BlockIndices &v)
void attach_dof_handler(const DoFHandler< dim, spacedim > &)
void add_data_vector(const VectorType &data, const std::vector< std::string > &names, const DataVectorType type=type_automatic, const std::vector< DataComponentInterpretation::DataComponentInterpretation > &data_component_interpretation={})
virtual void build_patches(const unsigned int n_subdivisions=0)
Definition: data_out.cc:1064
Definition: fe_q.h:549
void reinit(const MappingType &mapping, const DoFHandler< dim > &dof_handler, const AffineConstraints< number2 > &constraint, const QuadratureType &quad, const AdditionalData &additional_data=AdditionalData())
static unsigned int n_threads()
Definition: timer.h:119
double wall_time() const
Definition: timer.cc:264
void restart()
Definition: timer.h:898
Definition: vector.h:109
#define DEAL_II_WITH_P4EST
Definition: config.h:57
Point< 3 > center
Point< 2 > second
Definition: grid_out.cc:4604
Point< 2 > first
Definition: grid_out.cc:4603
unsigned int level
Definition: grid_out.cc:4606
__global__ void set(Number *val, const Number s, const size_type N)
std::string write_vtu_with_pvtu_record(const std::string &directory, const std::string &filename_without_extension, const unsigned int counter, const MPI_Comm &mpi_communicator, const unsigned int n_digits_for_counter=numbers::invalid_unsigned_int, const unsigned int n_groups=0) const
void loop(ITERATOR begin, typename identity< ITERATOR >::type end, DOFINFO &dinfo, INFOBOX &info, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &cell_worker, const std::function< void(DOFINFO &, typename INFOBOX::CellInfo &)> &boundary_worker, const std::function< void(DOFINFO &, DOFINFO &, typename INFOBOX::CellInfo &, typename INFOBOX::CellInfo &)> &face_worker, ASSEMBLER &assembler, const LoopControl &lctrl=LoopControl())
Definition: loop.h:439
void make_hanging_node_constraints(const DoFHandler< dim, spacedim > &dof_handler, AffineConstraints< number > &constraints)
const Event initial
Definition: event.cc:65
Expression cosh(const Expression &x)
Expression atan(const Expression &x)
IndexSet extract_locally_relevant_dofs(const DoFHandler< dim, spacedim > &dof_handler)
Definition: dof_tools.cc:1144
void interpolate(const DoFHandler< dim, spacedim > &dof1, const InVector &u1, const DoFHandler< dim, spacedim > &dof2, OutVector &u2)
void hyper_cube(Triangulation< dim, spacedim > &tria, const double left=0., const double right=1., const bool colorize=false)
void refine(Triangulation< dim, spacedim > &tria, const Vector< Number > &criteria, const double threshold, const unsigned int max_to_mark=numbers::invalid_unsigned_int)
void shift(const Tensor< 1, spacedim > &shift_vector, Triangulation< dim, spacedim > &triangulation)
Definition: grid_tools.cc:2050
double diameter(const Triangulation< dim, spacedim > &tria)
Definition: grid_tools.cc:83
@ matrix
Contents is actually a matrix.
@ diagonal
Matrix is diagonal.
@ general
No special properties.
static const types::blas_int one
double norm(const FEValuesBase< dim > &fe, const ArrayView< const std::vector< Tensor< 1, dim >>> &Du)
Definition: divergence.h:472
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
Definition: utilities.cc:190
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > b(const Tensor< 2, dim, Number > &F)
void call(const std::function< RT()> &function, internal::return_value< RT > &ret_val)
void free(T *&pointer)
Definition: cuda.h:97
std::vector< unsigned int > serial(const std::vector< unsigned int > &targets, const std::function< RequestType(const unsigned int)> &create_request, const std::function< AnswerType(const unsigned int, const RequestType &)> &answer_request, const std::function< void(const unsigned int, const AnswerType &)> &process_answer, const MPI_Comm &comm)
unsigned int this_mpi_process(const MPI_Comm &mpi_communicator)
Definition: mpi.cc:151
unsigned int n_mpi_processes(const MPI_Comm &mpi_communicator)
Definition: mpi.cc:140
T max(const T &t, const MPI_Comm &mpi_communicator)
std::string get_time()
Definition: utilities.cc:1016
const std::string get_current_vectorization_level()
Definition: utilities.cc:941
double compute_global_error(const Triangulation< dim, spacedim > &tria, const InVector &cellwise_error, const NormType &norm, const double exponent=2.)
void integrate_difference(const Mapping< dim, spacedim > &mapping, const DoFHandler< dim, spacedim > &dof, const InVector &fe_function, const Function< spacedim, typename InVector::value_type > &exact_solution, OutVector &difference, const Quadrature< dim > &q, const NormType &norm, const Function< spacedim, double > *weight=nullptr, const double exponent=2.)
void run(const Iterator &begin, const typename identity< Iterator >::type &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
Definition: work_stream.h:474
void copy(const T *begin, const T *end, U *dest)
int(&) functions(const void *v1, const void *v2)
static const unsigned int invalid_unsigned_int
Definition: types.h:201
::VectorizedArray< Number, width > sin(const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > sqrt(const ::VectorizedArray< Number, width > &)
const ::parallel::distributed::Triangulation< dim, spacedim > * triangulation
TasksParallelScheme tasks_parallel_scheme
Definition: matrix_free.h:343
const TriangulationDescription::Settings settings