From d7bdb0fd25534c95407998da96cbd41bc5b048d2 Mon Sep 17 00:00:00 2001 From: David Schneider Date: Wed, 26 Oct 2022 13:23:47 +0200 Subject: [PATCH] Add initial working version --- .../partitioned-heat-conduction/dirichlet.prm | 6 +- .../partitioned-heat-conduction/neumann.prm | 6 +- .../precice-config.xml | 4 +- include/heat_transfer/cuda_laplace_operator.h | 489 +++++++++++++----- include/heat_transfer/heat_transfer.h | 33 +- 5 files changed, 388 insertions(+), 150 deletions(-) diff --git a/examples/partitioned-heat-conduction/dirichlet.prm b/examples/partitioned-heat-conduction/dirichlet.prm index ae31e33..298550f 100644 --- a/examples/partitioned-heat-conduction/dirichlet.prm +++ b/examples/partitioned-heat-conduction/dirichlet.prm @@ -2,10 +2,10 @@ # --------------------- subsection Finite element system # Displacement system polynomial order - set Polynomial degree = 2 + set Polynomial degree = 1 # Gauss quadrature order - set Quadrature order = 3 + set Quadrature order = 2 end @@ -14,7 +14,7 @@ subsection Geometry set Dimension = 2 # Number of global refinements - set Global refinement = 3 + set Global refinement = 4 # Testcase to compute set Testcase = partitioned_heat_dirichlet diff --git a/examples/partitioned-heat-conduction/neumann.prm b/examples/partitioned-heat-conduction/neumann.prm index eb2ffae..ec7a0b4 100644 --- a/examples/partitioned-heat-conduction/neumann.prm +++ b/examples/partitioned-heat-conduction/neumann.prm @@ -2,10 +2,10 @@ # --------------------- subsection Finite element system # Displacement system polynomial order - set Polynomial degree = 2 + set Polynomial degree = 1 # Gauss quadrature order - set Quadrature order = 3 + set Quadrature order = 2 end @@ -14,7 +14,7 @@ subsection Geometry set Dimension = 2 # Number of global refinements - set Global refinement = 3 + set Global refinement = 4 # Testcase to compute set Testcase = partitioned_heat_neumann diff --git a/examples/partitioned-heat-conduction/precice-config.xml b/examples/partitioned-heat-conduction/precice-config.xml index 4789f5d..243add5 100644 --- a/examples/partitioned-heat-conduction/precice-config.xml +++ b/examples/partitioned-heat-conduction/precice-config.xml @@ -71,8 +71,8 @@ from="Neumann" to="Dirichlet" initialize="true" /> - - + + diff --git a/include/heat_transfer/cuda_laplace_operator.h b/include/heat_transfer/cuda_laplace_operator.h index 6551018..ccae218 100644 --- a/include/heat_transfer/cuda_laplace_operator.h +++ b/include/heat_transfer/cuda_laplace_operator.h @@ -2,6 +2,12 @@ #ifdef DEAL_II_COMPILER_CUDA_AWARE +# include +# include +# include + +# include + # include # include # include @@ -10,6 +16,7 @@ # include # include +# include namespace Heat_Transfer { @@ -19,188 +26,398 @@ namespace Heat_Transfer class Coefficient; - template - class LaplaceOperator - : public MatrixFreeOperators:: - Base> + template + class VaryingCoefficientFunctor { public: - using FECellIntegrator = - FECellIntegrators>; - using FEFaceIntegrator = - FEFaceIntegrators>; - using VectorType = LinearAlgebra::distributed::Vector; + VaryingCoefficientFunctor(double *coefficient) + : coef(coefficient) + {} - LaplaceOperator(); + __device__ void + operator()( + const unsigned int cell, + const typename CUDAWrappers::MatrixFree::Data *gpu_data); - void - clear() override; - void - evaluate_coefficient(const Coefficient &coefficient_function); + static const unsigned int n_dofs_1d = fe_degree + 1; + static const unsigned int n_local_dofs = ::Utilities::pow(n_dofs_1d, dim); + static const unsigned int n_q_points = ::Utilities::pow(n_dofs_1d, dim); - void - set_delta_t(const double delta_t_) - { - delta_t = delta_t_; - } + private: + double *coef; + }; + + + template + __device__ void + VaryingCoefficientFunctor::operator()( + const unsigned int cell, + const typename CUDAWrappers::MatrixFree::Data *gpu_data) + { + const unsigned int pos = CUDAWrappers::local_q_point_id( + cell, gpu_data, n_dofs_1d, n_q_points); + + coef[pos] = 1.; + } + + template + class LaplaceOperatorQuad + { + public: + __device__ + LaplaceOperatorQuad(double coef, double delta_t) + : coef(coef) + , delta_t(delta_t) + {} - virtual void - compute_diagonal() override; + __device__ void + operator()( + CUDAWrappers::FEEvaluation + *fe_eval) const; private: - virtual void - apply_add(VectorType &dst, const VectorType &src) const override; + double coef; + // TODO: Maybe remove from this class + double delta_t; + }; + + + + template + __device__ void + LaplaceOperatorQuad::operator()( + CUDAWrappers::FEEvaluation + *fe_eval) const + { + fe_eval->submit_value(fe_eval->get_value()); + fe_eval->submit_gradient(coef * fe_eval->get_gradient() * delta_t); + } + + template + class LocalLaplaceOperator + { + public: + LocalLaplaceOperator(double *coefficient, double delta_t) + : coef(coefficient) + , delta_t(delta_t) + {} + + __device__ void + operator()( + const unsigned int cell, + const typename CUDAWrappers::MatrixFree::Data *gpu_data, + CUDAWrappers::SharedData *shared_data, + const double *src, + double *dst) const; + + static const unsigned int n_dofs_1d = fe_degree + 1; + static const unsigned int n_local_dofs = Utilities::pow(fe_degree + 1, dim); + static const unsigned int n_q_points = Utilities::pow(fe_degree + 1, dim); + + private: + double *coef; + double delta_t; + }; + + + template + __device__ void + LocalLaplaceOperator::operator()( + const unsigned int cell, + const typename CUDAWrappers::MatrixFree::Data *gpu_data, + CUDAWrappers::SharedData *shared_data, + const double *src, + double *dst) const + { + const unsigned int pos = CUDAWrappers::local_q_point_id( + cell, gpu_data, n_dofs_1d, n_q_points); + + CUDAWrappers::FEEvaluation + fe_eval(cell, gpu_data, shared_data); + + fe_eval.read_dof_values(src); + fe_eval.evaluate(true, true); + fe_eval.apply_for_each_quad_point( + LaplaceOperatorQuad(coef[pos], delta_t)); + fe_eval.integrate(true, true); + + fe_eval.distribute_local_to_global(dst); + } + + template + class CUDALaplaceOperator + { + public: + using VectorType = + LinearAlgebra::distributed::Vector; + + CUDALaplaceOperator(); + + // and initialize the coefficient + void + initialize(const DoFHandler &dof_handler, + AffineConstraints &constraints); void - local_apply(const MatrixFree & data, - VectorType & dst, - const VectorType & src, - const std::pair &cell_range) const; + set_delta_t(double dt); void - do_operation_on_cell(FECellIntegrator &phi) const; + vmult(VectorType &dst, const VectorType &src) const; - Table<2, VectorizedArray> coefficient; - double delta_t = 0; + void + initialize_dof_vector(VectorType &vec) const; + + private: + CUDAWrappers::MatrixFree mf_data; + LinearAlgebra::CUDAWrappers::Vector coef; + double delta_t; }; - template - LaplaceOperator::LaplaceOperator() - : MatrixFreeOperators::Base() + template + CUDALaplaceOperator::CUDALaplaceOperator() {} - template + template void - LaplaceOperator::clear() + CUDALaplaceOperator::initialize( + const DoFHandler &dof_handler, + AffineConstraints &constraints) { - coefficient.reinit(0, 0); - MatrixFreeOperators::Base::clear(); - } + const int fe_degree = 1; + MappingQ mapping(fe_degree); + typename CUDAWrappers::MatrixFree::AdditionalData + additional_data; + additional_data.mapping_update_flags = + (update_values | update_JxW_values | update_gradients | + update_normal_vectors | update_quadrature_points); + const QGauss<1> quad(fe_degree + 1); + mf_data.reinit(mapping, dof_handler, constraints, quad, additional_data); - template - void - LaplaceOperator::evaluate_coefficient( - const Coefficient &coefficient_function) - { - const unsigned int n_cells = this->data->n_cell_batches(); - FECellIntegrator phi(*this->data); - - coefficient.reinit(n_cells, phi.n_q_points); - for (unsigned int cell = 0; cell < n_cells; ++cell) - { - phi.reinit(cell); - for (unsigned int q = 0; q < phi.n_q_points; ++q) - coefficient(cell, q) = - coefficient_function.value(phi.quadrature_point(q), 0); - } - } + const unsigned int n_owned_cells = + dynamic_cast *>( + &dof_handler.get_triangulation()) + ->n_locally_owned_active_cells(); + coef.reinit(Utilities::pow(fe_degree + 1, dim) * n_owned_cells); - - template - void - LaplaceOperator::local_apply( - const MatrixFree & data, - VectorType & dst, - const VectorType & src, - const std::pair &cell_range) const - { - FECellIntegrator phi(data); - - for (unsigned int cell = cell_range.first; cell < cell_range.second; ++cell) - { - AssertDimension(coefficient.size(0), data.n_cell_batches()); - AssertDimension(coefficient.size(1), phi.n_q_points); - - phi.reinit(cell); - phi.read_dof_values(src); - do_operation_on_cell(phi); - phi.distribute_local_to_global(dst); - } + const VaryingCoefficientFunctor functor(coef.get_values()); + mf_data.evaluate_coefficients(functor); } - - template + template void - LaplaceOperator::apply_add( - VectorType & dst, - const VectorType &src) const + CUDALaplaceOperator::set_delta_t(double dt) { - this->data->cell_loop(&LaplaceOperator::local_apply, this, dst, src); + delta_t = dt; } - - template + template void - LaplaceOperator::compute_diagonal() + CUDALaplaceOperator::vmult(VectorType &dst, + const VectorType &src) const { - this->inverse_diagonal_entries.reset(new DiagonalMatrix()); - VectorType &inverse_diagonal = this->inverse_diagonal_entries->get_vector(); - this->data->initialize_dof_vector(inverse_diagonal); - - MatrixFreeTools::compute_diagonal(*(this->data), - inverse_diagonal, - &LaplaceOperator::do_operation_on_cell, - this); - - this->set_constrained_entries_to_one(inverse_diagonal); - - for (unsigned int i = 0; i < inverse_diagonal.locally_owned_size(); ++i) - { - Assert(inverse_diagonal.local_element(i) > 0., - ExcMessage("No diagonal entry in a positive definite operator " - "should be zero")); - inverse_diagonal.local_element(i) = - 1. / inverse_diagonal.local_element(i); - } + dst = 0.; + const int fe_degree = 1; + LocalLaplaceOperator local_operator(coef.get_values(), + delta_t); + mf_data.cell_loop(local_operator, src, dst); + // We handle here only homogeneous constraints, so the copy here can + // probably ne omitted + mf_data.copy_constrained_values(src, dst); } - - template + template void - LaplaceOperator::do_operation_on_cell( - FECellIntegrator &phi) const + CUDALaplaceOperator::initialize_dof_vector(VectorType &vec) const { - Assert(delta_t > 0, ExcNotInitialized()); - const unsigned int cell = phi.get_current_cell_index(); - phi.evaluate(EvaluationFlags::values | EvaluationFlags::gradients); - for (unsigned int q = 0; q < phi.n_q_points; ++q) - { - phi.submit_value(phi.get_value(q), q); - phi.submit_gradient(coefficient(cell, q) * delta_t * - phi.get_gradient(q), - q); - } - phi.integrate(EvaluationFlags::values | EvaluationFlags::gradients); + mf_data.initialize_dof_vector(vec); } - // Helper function in order to evaluate the vectorized point - template - VectorizedArray - evaluate_function(const Function & function, - const Point> &p_vectorized, - const unsigned int component = 0) - { - VectorizedArray result; - for (unsigned int v = 0; v < VectorizedArray::size(); ++v) - { - Point p; - for (unsigned int d = 0; d < dim; ++d) - p[d] = p_vectorized[d][v]; - result[v] = function.value(p, component); - } - return result; - } + + /** + * Partial template specialization for CUDA + */ + // template + // class LaplaceOperator + // : public MatrixFreeOperators:: + // Base> + // { + // public: + // using FECellIntegrator = + // FECellIntegrators>; + // using FEFaceIntegrator = + // FEFaceIntegrators>; + // using VectorType = + // LinearAlgebra::distributed::Vector; + + // LaplaceOperator(); + + // void + // clear() override; + + // void + // evaluate_coefficient(const Coefficient &coefficient_function); + + // void + // set_delta_t(const double delta_t_) + // { + // delta_t = delta_t_; + // } + + // virtual void + // compute_diagonal() override; + + // private: + // virtual void + // apply_add(VectorType &dst, const VectorType &src) const override; + + // void + // local_apply(const MatrixFree &data, + // VectorType &dst, + // const VectorType &src, + // const std::pair &cell_range) + // const; + + // void + // do_operation_on_cell(FECellIntegrator &phi) const; + + // Table<2, VectorizedArray> coefficient; + // LinearAlgebra::CUDAWrappers::Vector coef; + // double delta_t = 0; + // }; + + + + // template + // LaplaceOperator::LaplaceOperator() + // : MatrixFreeOperators::Base() + // {} + + + + // template + // void + // LaplaceOperator::clear() + // { + // coefficient.reinit(0, 0); + // MatrixFreeOperators::Base::clear(); + // } + + + + // template + // void + // LaplaceOperator::evaluate_coefficient( + // const Coefficient &coefficient_function) + // { + // const unsigned int n_cells = this->data->n_cell_batches(); + // FECellIntegrator phi(*this->data); + + // coefficient.reinit(n_cells, phi.n_q_points); + // for (unsigned int cell = 0; cell < n_cells; ++cell) + // { + // phi.reinit(cell); + // for (unsigned int q = 0; q < phi.n_q_points; ++q) + // coefficient(cell, q) = + // coefficient_function.value(phi.quadrature_point(q), 0); + // } + // } + + + + // template + // void + // LaplaceOperator::local_apply( + // const MatrixFree &data, + // VectorType &dst, + // const VectorType &src, + // const std::pair &cell_range) const + // { + // FECellIntegrator phi(data); + + // for (unsigned int cell = cell_range.first; cell < cell_range.second; + // ++cell) + // { + // AssertDimension(coefficient.size(0), data.n_cell_batches()); + // AssertDimension(coefficient.size(1), phi.n_q_points); + + // phi.reinit(cell); + // phi.read_dof_values(src); + // do_operation_on_cell(phi); + // phi.distribute_local_to_global(dst); + // } + // } + + + + // template + // void + // LaplaceOperator::apply_add( + // VectorType &dst, + // const VectorType &src) const + // { + // this->data->cell_loop(&LaplaceOperator::local_apply, this, dst, src); + // } + + + + // template + // void + // LaplaceOperator::compute_diagonal() + // { + // this->inverse_diagonal_entries.reset(new DiagonalMatrix()); + // VectorType &inverse_diagonal = + // this->inverse_diagonal_entries->get_vector(); + // this->data->initialize_dof_vector(inverse_diagonal); + + // MatrixFreeTools::compute_diagonal(*(this->data), + // inverse_diagonal, + // &LaplaceOperator::do_operation_on_cell, + // this); + + // this->set_constrained_entries_to_one(inverse_diagonal); + + // for (unsigned int i = 0; i < inverse_diagonal.locally_owned_size(); ++i) + // { + // Assert(inverse_diagonal.local_element(i) > 0., + // ExcMessage("No diagonal entry in a positive definite operator + // " + // "should be zero")); + // inverse_diagonal.local_element(i) = + // 1. / inverse_diagonal.local_element(i); + // } + // } + + + + // template + // void + // LaplaceOperator::do_operation_on_cell( + // FECellIntegrator &phi) const + // { + // Assert(delta_t > 0, ExcNotInitialized()); + // const unsigned int cell = phi.get_current_cell_index(); + // phi.evaluate(EvaluationFlags::values | EvaluationFlags::gradients); + // for (unsigned int q = 0; q < phi.n_q_points; ++q) + // { + // phi.submit_value(phi.get_value(q), q); + // phi.submit_gradient(coefficient(cell, q) * delta_t * + // phi.get_gradient(q), + // q); + // } + // phi.integrate(EvaluationFlags::values | EvaluationFlags::gradients); + // } } // namespace Heat_Transfer #endif \ No newline at end of file diff --git a/include/heat_transfer/heat_transfer.h b/include/heat_transfer/heat_transfer.h index 498bb45..ebc220b 100644 --- a/include/heat_transfer/heat_transfer.h +++ b/include/heat_transfer/heat_transfer.h @@ -92,6 +92,9 @@ namespace Heat_Transfer using VectorType = LinearAlgebra::distributed::Vector; using LevelVectorType = LinearAlgebra::distributed::Vector; + using DeviceVector = + LinearAlgebra::distributed::Vector; + LaplaceProblem(const Parameters::HeatParameters ¶meters); void run(std::shared_ptr> testcase_); @@ -184,13 +187,15 @@ namespace Heat_Transfer Adapter::Adapter>> precice_adapter; + std::unique_ptr> cuda_operator; + ConditionalOStream pcout; mutable TimerOutput timer; unsigned long int total_n_cg_iterations; unsigned int total_n_cg_solve; // Valid options are none, jacobi and gmg - std::string preconditioner_type = "gmg"; + std::string preconditioner_type = "none"; Time time; }; @@ -293,6 +298,10 @@ namespace Heat_Transfer system_matrix.evaluate_coefficient(Coefficient()); system_matrix.set_delta_t(time.get_delta_t()); + cuda_operator.reset(new CUDALaplaceOperator()); + cuda_operator->initialize(dof_handler, constraints); + cuda_operator->set_delta_t(time.get_delta_t()); + // ... the second matrix-free operator for inhomogenous BCs AffineConstraints no_constraints; no_constraints.close(); @@ -536,11 +545,23 @@ namespace Heat_Transfer } else if (preconditioner_type == "none") { - SolverCG cg(solver_control); - cg.solve(system_matrix, - solution_update, - system_rhs, - PreconditionIdentity()); + SolverCG cg(solver_control); + DeviceVector update, rhs; + cuda_operator->initialize_dof_vector(update); + cuda_operator->initialize_dof_vector(rhs); + update = 0; + + LinearAlgebra::ReadWriteVector rw_vector( + dof_handler.locally_owned_dofs()); + constraints.set_zero(system_rhs); + + rw_vector.import(system_rhs, VectorOperation::insert); + rhs.import(rw_vector, VectorOperation::insert); + + cg.solve(*cuda_operator.get(), update, rhs, PreconditionIdentity()); + + rw_vector.import(update, VectorOperation::insert); + solution_update.import(rw_vector, VectorOperation::insert); } else if (preconditioner_type == "gmg") {