My Project
BILU0.hpp
1 /*
2  Copyright 2019 Equinor ASA
3 
4  This file is part of the Open Porous Media project (OPM).
5 
6  OPM is free software: you can redistribute it and/or modify
7  it under the terms of the GNU General Public License as published by
8  the Free Software Foundation, either version 3 of the License, or
9  (at your option) any later version.
10 
11  OPM is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with OPM. If not, see <http://www.gnu.org/licenses/>.
18 */
19 
20 #ifndef BILU0_HPP
21 #define BILU0_HPP
22 
23 #include <mutex>
24 
25 #include <opm/simulators/linalg/bda/BlockedMatrix.hpp>
26 #include <opm/simulators/linalg/bda/ILUReorder.hpp>
27 
28 #include <opm/simulators/linalg/bda/opencl.hpp>
29 #include <opm/simulators/linalg/bda/openclKernels.hpp>
30 #include <opm/simulators/linalg/bda/ChowPatelIlu.hpp>
31 
32 // if CHOW_PATEL is 0, exact ILU decomposition is performed on CPU
33 // if CHOW_PATEL is 1, iterative ILU decomposition (FGPILU) is done, as described in:
34 // FINE-GRAINED PARALLEL INCOMPLETE LU FACTORIZATION, E. Chow and A. Patel, SIAM 2015, https://doi.org/10.1137/140968896
35 // if CHOW_PATEL_GPU is 0, the decomposition is done on CPU
36 // if CHOW_PATEL_GPU is 1, the decomposition is done by bda::FGPILU::decomposition() on GPU
37 // the apply phase of the ChowPatelIlu uses two triangular matrices: L and U
38 // the exact decomposition uses a full matrix LU which is the superposition of L and U
39 // ChowPatelIlu could also operate on a full matrix LU when L and U are merged, but it is generally better to keep them split
40 #define CHOW_PATEL 0
41 #define CHOW_PATEL_GPU 1
42 
43 
44 namespace bda
45 {
46 
49  template <unsigned int block_size>
50  class BILU0
51  {
52 
53  private:
54  int N; // number of rows of the matrix
55  int Nb; // number of blockrows of the matrix
56  int nnz; // number of nonzeroes of the matrix (scalar)
57  int nnzbs; // number of blocks of the matrix
58  std::unique_ptr<BlockedMatrix<block_size> > LUmat = nullptr;
59  std::shared_ptr<BlockedMatrix<block_size> > rmat = nullptr; // only used with PAR_SIM
60 #if CHOW_PATEL
61  std::unique_ptr<BlockedMatrix<block_size> > Lmat = nullptr, Umat = nullptr;
62 #endif
63  double *invDiagVals = nullptr;
64  std::vector<int> diagIndex;
65  std::vector<int> rowsPerColor; // color i contains rowsPerColor[i] rows, which are processed in parallel
66  std::vector<int> rowsPerColorPrefix; // the prefix sum of rowsPerColor
67  std::vector<int> toOrder, fromOrder;
68  int numColors;
69  int verbosity;
70  std::once_flag pattern_uploaded;
71 
72  ILUReorder opencl_ilu_reorder;
73 
74  typedef struct {
75  cl::Buffer invDiagVals;
76  cl::Buffer diagIndex;
77  cl::Buffer rowsPerColor;
78 #if CHOW_PATEL
79  cl::Buffer Lvals, Lcols, Lrows;
80  cl::Buffer Uvals, Ucols, Urows;
81 #else
82  cl::Buffer LUvals, LUcols, LUrows;
83 #endif
84  } GPU_storage;
85 
86  ilu_apply1_kernel_type *ILU_apply1;
87  ilu_apply2_kernel_type *ILU_apply2;
88  cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale;
89  cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
90  cl::Buffer&, cl::Buffer&,
91  const int, cl::LocalSpaceArg> *ilu_decomp;
92 
93  GPU_storage s;
94  cl::Context *context;
95  cl::CommandQueue *queue;
96  std::vector<cl::Event> events;
97  cl_int err;
98  int work_group_size = 0;
99  int total_work_items = 0;
100  int lmem_per_work_group = 0;
101 
102  ChowPatelIlu chowPatelIlu;
103 
104  void chow_patel_decomposition();
105 
106  public:
107 
108  BILU0(ILUReorder opencl_ilu_reorder, int verbosity);
109 
110  ~BILU0();
111 
112  // analysis
113  bool init(BlockedMatrix<block_size> *mat);
114 
115  // ilu_decomposition
116  bool create_preconditioner(BlockedMatrix<block_size> *mat);
117 
118  // apply preconditioner, x = prec(y)
119  void apply(cl::Buffer& y, cl::Buffer& x);
120 
121  void setOpenCLContext(cl::Context *context);
122  void setOpenCLQueue(cl::CommandQueue *queue);
123  void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
124  void setKernels(
125  cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
126  cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
127  cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale,
128  cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp
129  );
130 
131  int* getToOrder()
132  {
133  return toOrder.data();
134  }
135 
136  int* getFromOrder()
137  {
138  return fromOrder.data();
139  }
140 
141  BlockedMatrix<block_size>* getRMat()
142  {
143  return rmat.get();
144  }
145 
146  };
147 
148 } // end namespace bda
149 
150 #endif
151 
This class implementa a Blocked ILU0 preconditioner The decomposition is done on CPU,...
Definition: BILU0.hpp:51
This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
Definition: BlockedMatrix.hpp:36
Definition: ChowPatelIlu.hpp:37