Computes multiple global dot products with a single MPI_Allreduce.
This batched version reduces MPI latency by combining n_dots separate synchronizations into one collective call.
| Type | Intent | Optional | Attributes | Name |
|---|---|---|---|---|
| type(flow_mpi_t) | intent(in) | | | flow |
| integer | intent(in) | | | n_dots |
| real(kind=rk) | intent(in) | | | a(:,:) |
| real(kind=rk) | intent(in) | | | b(:,:) |
| real(kind=rk) | intent(out) | | | results(:) |
!> Compute `n_dots` global dot products using one MPI_Allreduce.
!!
!! For every column `i` in `1..n_dots`, the products `a(c,i)*b(c,i)` are
!! summed over the cells `flow%first_cell .. flow%last_cell` on this rank.
!! All per-column partial sums are then combined across `flow%comm` with a
!! single sum-reduction instead of one reduction per dot product.
!!
!! Arguments:
!!   flow    - supplies the local cell range (`first_cell`, `last_cell`)
!!             and the communicator (`comm`) used for the reduction.
!!   n_dots  - number of dot products, i.e. columns of `a`/`b` that are used
!!             and number of values reduced.
!!   a, b    - operand arrays, indexed as (cell, dot).
!!   results - receives the reduced sums in its first `n_dots` entries.
!!
!! NOTE(review): the reduction is issued with MPI_DOUBLE_PRECISION, which
!! presumes `rk` is the double-precision kind - confirm.
!! NOTE(review): nothing here checks that `results`, `a` and `b` are large
!! enough for `n_dots` and the cell range - presumably guaranteed by the
!! caller; verify.
subroutine flow_global_dots_owned(flow, n_dots, a, b, results)
  use mod_profiling, only : profiler_start, profiler_stop

  type(flow_mpi_t), intent(in)  :: flow
  integer,          intent(in)  :: n_dots
  real(rk),         intent(in)  :: a(:,:)       ! (ncells, n_dots)
  real(rk),         intent(in)  :: b(:,:)       ! (ncells, n_dots)
  real(rk),         intent(out) :: results(:)   ! (n_dots)

  real(rk) :: partial(n_dots)   ! rank-local sum for each dot product
  real(rk) :: acc
  integer  :: icell, idot, mpi_err

  ! Rank-local contribution: one running sum per column, cells innermost
  ! so the leftmost (fastest-varying) index is traversed contiguously.
  do idot = 1, n_dots
    acc = zero
    do icell = flow%first_cell, flow%last_cell
      acc = acc + a(icell, idot) * b(icell, idot)
    end do
    partial(idot) = acc
  end do

  ! One collective for all columns; the profiler region brackets the
  ! reduction and its error check.
  call profiler_start('MPI_Communication')
  call MPI_Allreduce(partial, results, n_dots, MPI_DOUBLE_PRECISION, MPI_SUM, &
                     flow%comm, mpi_err)
  call check_mpi(mpi_err, 'MPI_Allreduce batched dots')
  call profiler_stop('MPI_Communication')
end subroutine flow_global_dots_owned