!-----------------------------------------------------------------------------------------!
! Copyright (c) 2018 Peter Grünberg Institut, Forschungszentrum Jülich, Germany           !
! This file is part of Jülich KKR code and available as free software under the conditions!
! of the MIT license as expressed in the LICENSE.md file in more detail.                  !
!-----------------------------------------------------------------------------------------!

!------------------------------------------------------------------------------------
!> Summary: Wrapper for the definition of the MPI helper functions
!> Author: 
!> Wrapper for the definition of the MPI helper functions
!------------------------------------------------------------------------------------
!> @note ruess: taken from Pkkr_sidebranch2D_2014_12_16, created by Bernd Zimmermann
!> @endnote
!------------------------------------------------------------------------------------
module mod_mympi

  implicit none

  private
  public :: myrank, nranks, master, mympi_init, mpiatom, mpiadapt, distribute_work_atoms, distribute_work_energies
#ifdef CPP_MPI
  public :: distribute_linear_on_tasks, find_dims_2d, create_newcomms_group_ie, mympi_main1c_comm, mympi_main1c_comm_newsosol, mympi_main1c_comm_newsosol2, &
    check_communication_pattern, bcast_global_variables
#endif

  integer, save :: myrank   = -1
  integer, save :: nranks   = -1
  integer, save :: master   = -1
  logical, save :: mpiatom  = .false.
  integer, save :: mpiadapt = -1

contains

  !-------------------------------------------------------------------------------
  !> Summary: Initialization of the MPI process
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> Initialization of the MPI process
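  !>
  !> Example (a minimal usage sketch; `kkr_main` is a hypothetical driver, MPI itself
  !> must be initialized first when compiled with -DCPP_MPI):
  !>
  !>   program kkr_main
  !>     use :: mod_mympi, only: mympi_init, myrank, nranks, master
  !>     use :: mpi
  !>     integer :: ierr
  !>     call mpi_init(ierr)
  !>     call mympi_init()
  !>     if (myrank==master) write (*, *) 'running on', nranks, 'ranks'
  !>     call mpi_finalize(ierr)
  !>   end program kkr_main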
  !-------------------------------------------------------------------------------
  subroutine mympi_init()

#ifdef CPP_MPI
    use :: mpi

    integer :: ierr
#endif

    master = 0

#ifdef CPP_MPI
    call mpi_comm_rank(mpi_comm_world, myrank, ierr)
    call mpi_comm_size(mpi_comm_world, nranks, ierr)
#else
    myrank = master
    nranks = 1
#endif

  end subroutine mympi_init

#ifdef CPP_MPI
  !-------------------------------------------------------------------------------
  !> Summary: Distributes 'ntot' points on 'nranks' tasks and returns the number of points per task, `ntot_pT`, and the offsets of the tasks, `ioff_pT`.
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> Distributes 'ntot' points on 'nranks' tasks and returns the number of points 
  !> per task, `ntot_pT`, and the offsets of the tasks, `ioff_pT`.
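  !>
  !> Example (illustration of the resulting distribution): distributing `ntot=10`
  !> points on `nranks=4` tasks gives
  !>
  !>   call distribute_linear_on_tasks(4, myrank, master, 10, ntot_pT, ioff_pT, .false.)
  !>   ! ntot_pT = (/ 3, 3, 2, 2 /),  ioff_pT = (/ 0, 3, 6, 8 /)
  !>
  !> i.e. task `irank` treats points `ioff_pT(irank)+1` to `ioff_pT(irank)+ntot_pT(irank)`.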
  !-------------------------------------------------------------------------------
  subroutine distribute_linear_on_tasks(nranks,myrank,master,ntot,ntot_pt,ioff_pt,  &
    output,fill_rest)

    implicit none

    integer, intent (in) :: nranks, myrank, master, ntot
    logical, intent (in) :: output
    logical, intent (in), optional :: fill_rest
    integer, dimension(0:nranks-1), intent (out) :: ntot_pt, ioff_pt

    integer :: irest, irank

    ntot_pt = int(ntot/nranks)
    ioff_pt = int(ntot/nranks)* [ (irank,irank=0,nranks-1) ]
    irest = ntot - int(ntot/nranks)*nranks

    if (irest>0) then
      do irank = 0, irest - 1
        ntot_pt(irank) = ntot_pt(irank) + 1
        ioff_pt(irank) = ioff_pt(irank) + irank
      end do                       ! irank
      do irank = irest, nranks - 1
        ioff_pt(irank) = ioff_pt(irank) + irest
      end do                       ! irank
    end if                         ! irest>0

    if (present(fill_rest)) then
      if (fill_rest) then
        if (ntot<nranks) then
          ! write(*,*) 'set rest',myrank,ntot,nranks
          do irank = ntot, nranks - 1
            ioff_pt(irank) = ioff_pt(irank-1)
            ntot_pt(irank) = ntot_pt(irank-1)
          end do                   ! irank
        end if                     ! ntot<nranks
      end if                       ! fill_rest
    end if                         ! present(fill_rest)

    if (myrank==master .and. output) then
      write (1337, *) '==== DISTRIBUTION OF POINTS ON TASKS: ===='
      do irank = 0, nranks - 1
        write (1337, '("Task ",I0," treats points ",I0," to ",I0,",#of points= ",I0)') irank, ioff_pt(irank) + 1, ioff_pt(irank) + ntot_pt(irank), ntot_pt(irank)
      end do                       ! irank
      write (1337, *) '=========================================='
    end if                         ! myrank==master

  end subroutine distribute_linear_on_tasks
#endif


#ifdef CPP_MPI
  !-------------------------------------------------------------------------------
  !> Summary: Find dimensions to create cartesian communicator
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> Find dimensions to create cartesian communicator
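  !>
  !> Example (illustration): for `nranks=12`, `ntot1=4` atoms, `ntot2=3` energy points
  !> and `mpiatom=.false.` the routine returns
  !>
  !>   call find_dims_2d(12, 4, 3, dims, .false.)
  !>   ! dims = (/ 4, 3 /), i.e. 4 atom ranks times 3 energy ranks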
  !-------------------------------------------------------------------------------
  subroutine find_dims_2d(nranks, ntot1, ntot2, dims, mpiatom)
    ! find dimensions to create cartesian communicator
    ! input:  nranks, ntot1 is N_atom, ntot2 is N_E
    ! output: dims(2), dims(1) is N_atomranks, dims(2) is N_Eranks
    use :: mpi

    implicit none
    integer, intent (in) :: nranks, ntot1, ntot2
    logical, intent (in) :: mpiatom
    integer, dimension(2), intent (out) :: dims
    integer :: ierr

    if (.not. mpiatom) then
      if (nranks<=ntot2) then
        dims(1) = 1
        dims(2) = nranks
      else
        dims(1) = nranks/ntot2
        dims(2) = ntot2
      end if
    else
      if (nranks<=ntot1) then
        dims(2) = 1
        dims(1) = nranks
      else
        dims(2) = nranks/ntot1
        dims(1) = ntot1
      end if
    end if

    if (nranks>(ntot1*(ntot2+1)-1)) then
      if (myrank==master) write (*, '(A,I3,A,I3,A,I5)') 'Error: for', ntot1, ' atoms and', ntot2, ' energy points you use too many processors. Nranks=', nranks
      call mpi_barrier(mpi_comm_world, ierr)
      call mpi_finalize(ierr)
      stop 'Error: too many ranks'
    end if

  end subroutine find_dims_2d
#endif


#ifdef CPP_MPI
  !-------------------------------------------------------------------------------
  !> Summary: Takes vector `kmesh` with mesh/timing information and finds the number of rest procs that are divided into the fractions given in `ktake` for an optimal division of work
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> Takes vector `kmesh` with mesh/timing information and finds the number of rest
  !> procs that are divided into the fractions given in `ktake` for an optimal division of work
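  !>
  !> Example (illustration, cf. the sketch in the routine body): for `nranks=49`,
  !> `ne=8` energy points and `nat=5` atoms the rectangular part of the processor
  !> matrix uses ne*nat = 40 ranks; the remaining rest = 9 ranks are attached to the
  !> energy points with the densest k-meshes in the fractions k+l+m = 5+3+1.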
  !-------------------------------------------------------------------------------
  subroutine create_newcomms_group_ie(master,nranks,myrank,nat,ne,nkmesh,kmesh,     &
    mympi_comm_ie,myrank_ie,nranks_ie,mympi_comm_at,myrank_at,nranks_at,            &
    myrank_atcomm,nranks_atcomm)

    use :: mpi
    use :: mod_datatypes, only: dp
    implicit none

    integer, intent (in) :: master, nranks, myrank, ne, nat, nkmesh
    integer, dimension(nkmesh), intent (in) :: kmesh
    integer, intent (out) :: mympi_comm_ie, mympi_comm_at, nranks_atcomm, nranks_at
    integer, intent (out) :: nranks_ie, myrank_ie, myrank_at, myrank_atcomm

    integer :: rest,myg, ie, iat, ierr, ik, i1, i2, i3
    integer, dimension(nkmesh-1)  :: k
    integer, dimension(nkmesh)    :: ktake
    real (kind=dp) :: q, qmin
    real (kind=dp), dimension(nkmesh-1) :: f
    integer, dimension(:), allocatable :: mygroup
    integer, dimension(:,:), allocatable :: groups
    integer :: mympi_group_ie, mympi_group_world, mympi_comm_grid

    rest = 0
    qmin = -1
    if (myrank==master) write (1337, *) 'create_newcomms_group_ie input:', nranks, ne, nat

    ktake(:) = 0

    ! now check different cases:
    ! 1: more than one energy group with rest ranks (tackle load imbalance)
    ! 2: no rest ranks (use cartesian grid)
    ! 3: only atom parallelization without rest ranks
    ! 4: only atom parallelization with rest ranks (discard rest ranks and use cartesian grid)
    ! 5: only energy parallelization without rest ranks

    if ((ne*nat)<nranks .and. (ne>1)) then ! .and. nat>1)) then
      ! find parallelization with rest ranks divided over the energy group with the
      ! densest k-mesh

      if (nkmesh<=1) then
        if (myrank==master) write (*, '(A,2I7)') &
          'no load imbalance found (all energy points have the same k-mesh), please use a regular grid to not waste any resources. #E, #atoms = ', ne, nat
      end if

      rest = nranks - int(nranks/(ne*nat))*ne*nat
      if (myrank==master) write (1337, *) 'rest:', rest, ne, nat, nranks
      if (myrank==master) write (1337, *) 'kmesh:', kmesh

      if (rest>0 .and. nkmesh>1) then
        ! find fraction of k:l:m
        do ik = 1, nkmesh - 1
          k(ik) = int(real(kmesh(1))/real(kmesh(nkmesh-ik+1))-1.)
          if (k(ik)==0) k(ik) = 1
        end do

        do ik = 1, nkmesh - 2
          f(ik) = real(k(ik+1))/real(k(ik))
        end do
        f(nkmesh-1) = real(k(nkmesh-1))/real(k(1))

        if (myrank==master) write (1337, *) 'set k,i:', k, 'f', f

        ! brute force look for optimal division of rest ranks after N_E*N_at are already
        ! assigned to rectangular part of processor matrix:
        !
        !                     N_E=8
        !             <--------------->
        !           ^ ( | | | | | | | )    example for 49 processors,
        !           | ( | | | | | | | )    divided according to:
        !    N_at=5 | ( | | | | | | | )       N_E = 8, N_at = 5
        !           | ( | | | | | | | )       rest = 9 = 5+3+1
        !           v ( | | | | | | | )                  k+l+m
        !                 ^    ( | | ) m=1  ^
        !             l=3 |      ( | )      |
        !                 v      ( | )      | k=5
        !                          ( )      |
        !                          ( )      v

        if (nkmesh==4) then
          ktake(:) = 0
          do i1 = 1, rest
            do i2 = 0, rest - i1
              i3 = rest - i1 - i2
              if (i1>=i2 .and. i2>=i3) then
                if (i3==0 .and. i2==0) then
                  q = sqrt((f(1)-real(i2)/real(i1))**2+(f(2)-1.)**2+(f(3)-real(i3)/real(i1))**2)
                else
                  q = sqrt((f(1)-real(i2)/real(i1))**2+(f(2)-real(i3)/real(i2))**2+(f(3)-real(i3)/real(i1))**2)
                end if
                if (q<qmin .or. qmin==-1) then
                  ktake(1:3) = [ i1, i2, i3 ]
                  qmin = q
                end if
              end if
            end do
          end do
        else if (nkmesh==3) then
          ktake(:) = 0
          do i1 = 1, rest
            i2 = rest - i1
            if (i1>=i2) then
              q = sqrt((f(1)-real(i2)/real(i1))**2)
              if (q<qmin .or. qmin==-1) then
                ktake(1:2) = [ i1, i2 ]
                qmin = q
              end if
            end if
          end do
        else if (nkmesh==2) then
          ktake(1) = rest
        else
          stop 'ERROR: nkmesh>4 not implemented yet'
        end if

        ! special case when only one additional rank
        if (rest==1) ktake(1) = rest

        if (myrank==master) write (1337, *) 'found ktake', ktake, 'with', qmin

      else if (rest>0) then
        ktake(1) = rest
      end if                       ! if(rest>0 .and. nkmesh>1)

      ! find processor groups according to non-uniform division
      allocate (groups(ne+1,2))
      groups(:, :) = -1

      do ie = 1, ne + 1
        if (ie==ne-2 .and. nkmesh>3) then
          groups(ie, 1) = nat + ktake(3)
        else if (ie==ne-1 .and. nkmesh>2) then
          groups(ie, 1) = nat + ktake(2)
        else if (ie==ne-0 .and. nkmesh>=1) then
          groups(ie, 1) = nat + ktake(1)
        else
          groups(ie, 1) = nat
        end if
        if (ie==1) then
          groups(ie, 2) = 0
        else
          groups(ie, 2) = groups(ie-1, 1) + groups(ie-1, 2)
        end if
      end do

      if (myrank==master) write (1337, *) 'groups(1:Ne), number of ranks:', groups(1:ne, 1)
      if (myrank==master) write (1337, *) 'groups(1:Ne), ie offset:', groups(1:ne, 2)

      ! find my group
      myg = -1
      do ie = 1, ne
        do iat = groups(ie, 2), groups(ie+1, 2) - 1
          if (myrank==iat) myg = ie
        end do
      end do

      if (myg==-1) then
        write (1337, *) 'no group found for rank', myrank
        stop
      end if

      ! get group of processors in my group
      allocate (mygroup(groups(myg,1)))


      ie = 0
      do iat = groups(myg, 2), groups(myg+1, 2) - 1
        ie = ie + 1
        mygroup(ie) = iat
      end do


      ! create new communicator from group
      call mpi_comm_group(mpi_comm_world, mympi_group_world, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_COMM_GROUP with code ', ierr
        stop 'Error in MPI_COMM_GROUP'
      end if

      call mpi_group_incl(mympi_group_world, groups(myg,1), mygroup, mympi_group_ie, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_GROUP_INCL with code ', ierr
        stop 'Error in MPI_GROUP_INCL'
      end if
      call mpi_comm_create(mpi_comm_world, mympi_group_ie, mympi_comm_ie, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_COMM_CREATE with code ', ierr
        stop 'Error in MPI_COMM_CREATE'
      end if
      call mpi_group_free(mympi_group_ie, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_GROUP_FREE with code ', ierr
        stop 'Error in MPI_GROUP_FREE'
      end if

      ! get rank and size in new communicator
      call mpi_comm_rank(mympi_comm_ie, myrank_ie, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_COMM_RANK(ie) with code ', ierr
        stop 'Error in MPI_COMM_RANK(ie)'
      end if
      call mpi_comm_size(mympi_comm_ie, nranks_ie, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_COMM_SIZE(ie) with code ', ierr
        stop 'Error in MPI_COMM_SIZE(ie)'
      end if

      ! create communicator to communicate between different energies (i.e. different groups)
      call mpi_comm_split(mpi_comm_world, myrank_ie, myg, mympi_comm_at, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_COMM_SPLIT with code ', ierr
        stop 'Error in MPI_COMM_SPLIT'
      end if
      call mpi_comm_rank(mympi_comm_at, myrank_atcomm, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_COMM_RANK(at) with code ', ierr
        stop 'Error in MPI_COMM_RANK(at)'
      end if
      call mpi_comm_size(mympi_comm_at, nranks_atcomm, ierr)
      if (ierr/=mpi_success) then
        write (*, *) 'Error in MPI_COMM_SIZE(at) with code ', ierr
        stop 'Error in MPI_COMM_SIZE(at)'
      end if

      nranks_at = ne
      myrank_at = myg - 1

    else if ((ne*nat)==nranks .and. (ne>1 .and. nat>1)) then
      ! no rest ranks, use cartesian grid

      rest = 0

      if (myrank==master) write (1337, *) 'create cartesian grid:', ne, nat, nranks
      call mpi_cart_create(mpi_comm_world, 2, [ne,nat], [.false.,.false.], .true., mympi_comm_grid, ierr)

      if (myrank==master) write (1337, *) 'MPI_Cart_sub'
      call mpi_cart_sub(mympi_comm_grid, [.true.,.false.], mympi_comm_at, ierr) ! row communicator
      call mpi_cart_sub(mympi_comm_grid, [.false.,.true.], mympi_comm_ie, ierr) ! col communicator

      if (myrank==master) write (1337, *) 'MPI_Comm_rank'
      call mpi_comm_rank(mympi_comm_ie, myrank_ie, ierr)
      call mpi_comm_rank(mympi_comm_at, myrank_at, ierr)

      if (myrank==master) write (1337, *) 'MPI_Comm_size'
      call mpi_comm_size(mympi_comm_ie, nranks_ie, ierr)
      call mpi_comm_size(mympi_comm_at, nranks_at, ierr)

      myrank_atcomm = myrank_at
      nranks_atcomm = nranks_at

    else if (ne==1 .and. nat==nranks) then
      ! no rest ranks, can use mpi_comm world entirely for atom parallelization

      rest = 0

      mympi_comm_ie = mpi_comm_world
      myrank_ie = myrank
      nranks_ie = nranks
      mympi_comm_at = mpi_comm_self
      myrank_at = 0
      nranks_at = 1

      myrank_atcomm = myrank_at
      nranks_atcomm = nranks_at

    else if (nat<nranks .and. (ne==1 .and. nat>1)) then
      ! technically have rest ranks but do not use them since ne==1 (wastes some resources!)
      if (myrank==master) then
        write (*, '(A,I5,A)') 'WARNING: nat<nranks but ne==1: ', nranks-nat, ' ranks are unused!'
        write (*, '(A)') 'Please consider modifying your jobscript to not waste any resources!'
        write (1337, '(A,I5,A)') 'WARNING: nat<nranks but ne==1: ', nranks-nat, ' ranks are unused!'
        write (1337, '(A)') 'Please consider modifying your jobscript to not waste any resources!'
      end if

      ! set rest artificially to zero (affects printout only)
      rest = 0
      if (myrank<nat) then
        call mpi_comm_split(mpi_comm_world, 1, myrank, mympi_comm_ie, ierr)
      else
        call mpi_comm_split(mpi_comm_world, myrank+1000, 0, mympi_comm_ie, ierr)
      end if

      call mpi_comm_rank(mympi_comm_ie, myrank_ie, ierr)
      call mpi_comm_size(mympi_comm_ie, nranks_ie, ierr)
      if (myrank>=nat) then
        myrank_ie = -1 ! overwrite this so that myrank==master checks return false for unused ranks
      end if

      ! set atom communicator manually to mpi_comm_self
      mympi_comm_at = mpi_comm_self
      nranks_at = 1
      myrank_at = 0
      ! fill atcomm helpers (only used for rest>0)
      nranks_atcomm = nranks_at
      myrank_atcomm = myrank_at

    else
      ! fallback: use all ranks for energy parallelization without rest ranks

      rest = 0

      mympi_comm_at = mpi_comm_world
      myrank_at = myrank
      nranks_at = nranks
      mympi_comm_ie = mpi_comm_self
      myrank_ie = 0
      nranks_ie = 1

      myrank_atcomm = myrank_at
      nranks_atcomm = nranks_at

    end if

    ! print output
    if (myrank==master) then
      write (1337, '(A)') '=================================================='
      write (1337, '(A,I5,A)') '    MPI parallelization: use', nranks, ' ranks'
      write (1337, '(A,I3,A,I4)') '    create processor array of size (nat x ne) ', nat, ' x', ne
      write (1337, '(A,I5,A,I5)') '    nranks_at: ', nranks_at, ', nranks_ie:', nranks_ie
      if (rest>0) write (1337, '(A,I3)') '                                   with rest', rest
      if (rest>0) write (1337, '(A,10I3)') '    divide rest onto last energy points (k,l,m):', ktake
      write (1337, '(A)') '                N_E'
      write (1337, '(A)') '         <--------------->'
      write (1337, '(A)') '       ^ ( | | | | | | | )'
      write (1337, '(A)') '       | ( | | | | | | | )'
      write (1337, '(A)') '  N_at | ( | | | | | | | )'
      write (1337, '(A)') '       | ( | | | | | | | )'
      if (rest==0) write (1337, '(A)') '       v ( | | | | | | | )'
      if (rest>0) write (1337, '(A)') '       v ( | | | | | | | )....'
      if (rest>0) write (1337, '(A)') '              ^    ( | | ) m  ^'
      if (rest>0) write (1337, '(A)') '            l |      ( | )    |'
      if (rest>0) write (1337, '(A)') '              v......( | )    | k'
      if (rest>0) write (1337, '(A)') '                       ( )    |'
      if (rest>0) write (1337, '(A)') '                       ( )....v'
    end if

  end subroutine create_newcomms_group_ie
#endif

#ifdef CPP_MPI
  !-------------------------------------------------------------------------------
  !> Summary: MPI communication for the `main1c` subroutine
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> MPI communication for the `main1c` subroutine
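  !>
  !> All arrays are summed in place over the ranks of `mympi_comm`; every array below
  !> follows the same pattern (a sketch, with `dcopy`/`zcopy` for real/complex arrays):
  !>
  !>   allocate (work(...))                                           ! buffer of the same size
  !>   call mpi_allreduce(array, work, idim, <mpi_type>, mpi_sum, mympi_comm, ierr)
  !>   call dcopy(idim, work, 1, array, 1)                            ! copy the summed result back
  !>   deallocate (work)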
  !-------------------------------------------------------------------------------
  subroutine mympi_main1c_comm(irmd,lmpotd,natypd,lmaxd,lmaxd1,lmmaxd,npotd,ielast, &
    mmaxd,idoldau,natyp,krel,lmomvec,nmvecmax,nqdos,rho2ns,r2nef,espv,den,denlm,    &
    denmatc,denef,denefat,rhoorb,muorb,mvevi,mvevil,mvevief,mympi_comm)

    use :: mpi
    use :: mod_datatypes, only: dp
    implicit none

    integer, intent (in) :: krel !! Switch for non- (or scalar-) relativistic/relativistic (Dirac) program (0/1). Attention: several other parameters depend explicitly on KREL, they are set automatically Used for Dirac solver in ASA
    integer, intent (in) :: irmd    !! Maximum number of radial points
    integer, intent (in) :: npotd   !! (2*(KREL+KORBIT)+(1-(KREL+KORBIT))*NSPIND)*NATYP)
    integer, intent (in) :: nqdos
    integer, intent (in) :: natyp   !! Number of kinds of atoms in unit cell
    integer, intent (in) :: mmaxd
    integer, intent (in) :: lmaxd   !! Maximum l component in wave function expansion
    integer, intent (in) :: lmpotd  !! (lpot+1)**2
    integer, intent (in) :: natypd  !! Number of kinds of atoms in unit cell
    integer, intent (in) :: lmmaxd  !! (KREL+KORBIT+1)*(LMAX+1)**2
    integer, intent (in) :: ielast
    integer, intent (in) :: lmaxd1
    integer, intent (in) :: idoldau !! flag to perform LDA+U
    logical, intent (in) :: lmomvec
    integer, intent (in) :: nmvecmax
    integer, intent (in) :: mympi_comm
    ! .. In/Out variables
    real (kind=dp), intent (inout) :: denef
    real (kind=dp), dimension(natypd), intent (inout) :: denefat
    real (kind=dp), dimension(0:lmaxd1, npotd), intent (inout)            :: espv
    real (kind=dp), dimension(irmd*krel+(1-krel), natypd), intent (inout) :: rhoorb !! orbital density
    real (kind=dp), dimension(0:lmaxd1+1, 3, natypd), intent (inout)  :: muorb    !! orbital magnetic moment
    real (kind=dp), dimension(irmd, lmpotd, natypd, 2), intent (inout) :: r2nef   !! rho at FERMI energy
    real (kind=dp), dimension(irmd, lmpotd, natypd, 2), intent (inout) :: rho2ns  !! radial density
    complex (kind=dp), dimension(natypd, 3, nmvecmax), intent (inout) :: mvevi
    complex (kind=dp), dimension(natypd, 3, nmvecmax), intent (inout) :: mvevief
    complex (kind=dp), dimension(mmaxd, mmaxd, npotd), intent (inout) :: denmatc
    complex (kind=dp), dimension(0:lmaxd1, ielast, npotd, nqdos), intent (inout)  :: den
    complex (kind=dp), dimension(lmmaxd, ielast, npotd, nqdos), intent (inout)    :: denlm
    complex (kind=dp), dimension(0:lmaxd, natypd, 3, nmvecmax), intent (inout)    :: mvevil
    ! .. Local variables
    integer :: idim, ierr          ! , myrank_comm
    integer, parameter :: master = 0
    real (kind=dp), dimension(:), allocatable :: work1
    real (kind=dp), dimension(:, :), allocatable :: work2
    real (kind=dp), dimension(:, :, :), allocatable :: work3
    real (kind=dp), dimension(:, :, :, :), allocatable :: work4
    complex (kind=dp), dimension(:,:,:), allocatable :: work3c
    complex (kind=dp), dimension(:,:,:,:), allocatable :: work4c

    allocate (work4(irmd,lmpotd,natypd,2), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
    work4 = 0.0_dp
    idim = irmd*lmpotd*natypd*2
    call mpi_allreduce(rho2ns, work4, idim, mpi_double_precision, mpi_sum, mympi_comm, ierr)
    call dcopy(idim, work4, 1, rho2ns, 1)
    deallocate (work4)

    allocate (work4(irmd,lmpotd,natypd,2), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
    work4 = 0.0_dp
    idim = irmd*lmpotd*natypd*2
    call mpi_allreduce(r2nef, work4, idim, mpi_double_precision, mpi_sum, mympi_comm, ierr)
    call dcopy(idim, work4, 1, r2nef, 1)
    deallocate (work4)

    ! ESPV needs integration over atoms and energies -> MPI_COMM_WORLD
    allocate (work2(0:lmaxd+1,npotd), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
    work2 = 0.0_dp
    idim = (lmaxd+2)*npotd
    call mpi_allreduce(espv, work2, idim, mpi_double_precision, mpi_sum, mympi_comm, ierr)
    call dcopy(idim, work2, 1, espv, 1)
    deallocate (work2)

    allocate (work4c(0:lmaxd+1,ielast,npotd,nqdos), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
    work4c = (0.0_dp, 0.0_dp)
    idim = ielast*(lmaxd+2)*npotd*nqdos
    call mpi_allreduce(den, work4c, idim, mpi_double_complex, mpi_sum, mympi_comm, ierr)
    call zcopy(idim, work4c, 1, den, 1)
    deallocate (work4c)

    allocate (work4c(ielast,lmmaxd,nqdos,npotd), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
    work4c = (0.0_dp, 0.0_dp)
    idim = ielast*(lmmaxd)*npotd*nqdos
    call mpi_allreduce(denlm, work4c, idim, mpi_double_complex, mpi_sum, mympi_comm, ierr)
    call zcopy(idim, work4c, 1, denlm, 1)
    deallocate (work4c)

    if (idoldau==1) then
      allocate (work3c(mmaxd,mmaxd,npotd), stat=ierr)
      if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
      work3c = (0.0_dp, 0.0_dp)
      idim = mmaxd*mmaxd*npotd
      call mpi_allreduce(denmatc, work3c, idim, mpi_double_complex, mpi_sum, mympi_comm, ierr)
      call zcopy(idim, work3c, 1, denmatc, 1)
      deallocate (work3c)
    end if

    allocate (work1(1), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
    work1 = 0.0_dp
    idim = 1
    call mpi_allreduce(denef, work1, idim, mpi_double_precision, mpi_sum, mympi_comm, ierr)
    call dcopy(idim, work1, 1, denef, 1)
    deallocate (work1)

    allocate (work1(natyp), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
    work1 = 0.0_dp
    idim = natyp
    call mpi_allreduce(denefat, work1, idim, mpi_double_precision, mpi_sum, mympi_comm, ierr)
    call dcopy(idim, work1, 1, denefat, 1)
    deallocate (work1)

    if (krel==1) then
      allocate (work2(irmd,natypd), stat=ierr)
      if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
      work2 = 0.0_dp
      idim = irmd*natypd
      call mpi_allreduce(rhoorb, work2, idim, mpi_double_precision, mpi_sum, mympi_comm, ierr)
      call dcopy(idim, work2, 1, rhoorb, 1)
      deallocate (work2)

      allocate (work3(0:lmaxd+2,natypd,3), stat=ierr)
      if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
      work3 = 0.0_dp
      idim = (lmaxd+3)*natypd*3
      call mpi_allreduce(muorb, work3, idim, mpi_double_precision, mpi_sum, mympi_comm, ierr)
      call dcopy(idim, work3, 1, muorb, 1)
      deallocate (work3)

      if (lmomvec) then
        allocate (work3c(natypd,3,nmvecmax), stat=ierr)
        if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
        work3c = (0.0_dp, 0.0_dp)
        idim = natypd*3*nmvecmax
        call mpi_allreduce(mvevi, work3c, idim, mpi_double_complex, mpi_sum, mympi_comm, ierr)
        call zcopy(idim, work3c, 1, mvevi, 1)
        deallocate (work3c)

        allocate (work4c(lmaxd+1,natypd,3,nmvecmax), stat=ierr)
        if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
        work4c = (0.0_dp, 0.0_dp)
        idim = (lmaxd+1)*natypd*3*nmvecmax
        call mpi_allreduce(mvevil, work4c, idim, mpi_double_complex, mpi_sum, mympi_comm, ierr)
        call zcopy(idim, work4c, 1, mvevil, 1)
        deallocate (work4c)

        allocate (work3c(natypd,3,nmvecmax), stat=ierr)
        if (ierr/=0) stop '[mympi_main1c_comm] error allocating work array'
        work3c = (0.0_dp, 0.0_dp)
        idim = natypd*3*nmvecmax
        call mpi_allreduce(mvevief, work3c, idim, mpi_double_complex, mpi_sum, mympi_comm, ierr)
        call zcopy(idim, work3c, 1, mvevief, 1)
        deallocate (work3c)
      end if                       ! LMOMVEC
    end if                         ! KREL.EQ.1

  end subroutine mympi_main1c_comm
#endif

#ifdef CPP_MPI
  !-------------------------------------------------------------------------------
  !> Summary: MPI communication for the new solver in the `main1c` subroutine 
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> MPI communication for the new solver in the `main1c` subroutine 
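  !>
  !> In contrast to `mympi_main1c_comm`, the arrays here are collected with `mpi_reduce`
  !> onto `master`, so only the master rank of `mympi_comm` holds the summed result
  !> afterwards (a sketch of the pattern used for every array below):
  !>
  !>   call mpi_reduce(array, work, idim, <mpi_type>, mpi_sum, master, mympi_comm, ierr)
  !>   call zcopy(idim, work, 1, array, 1)   ! summed result valid on master only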
  !-------------------------------------------------------------------------------
  subroutine mympi_main1c_comm_newsosol(nspin, korbit, irmdnew, lmpotd, lmaxd, lmaxd1, lmmax0d, lmmaxd, ielast, nqdos, den, denlm, gflle, rho2nsc, r2nefc, rho2int, espv, muorb, denorbmom, &
    denorbmomsp, denorbmomlm, denorbmomns, mympi_comm)

    use :: mpi
    use :: mod_datatypes, only: dp
    implicit none
    ! .. Input variables
    integer, intent (in) :: nqdos
    integer, intent (in) :: lmaxd   !! Maximum l component in wave function expansion
    integer, intent (in) :: lmpotd  !! (lpot+1)**2
    integer, intent (in) :: lmmax0d  !! (LMAX+1)**2
    integer, intent (in) :: ielast
    integer, intent (in) :: lmaxd1
    integer, intent (in) :: irmdnew
    integer, intent (in) :: lmmaxd !! (krel+korbit+1)*(LMAX+1)**2
    integer, intent (in) :: nspin
    integer, intent (in) :: korbit
    integer, intent (in) :: mympi_comm
    ! .. In/Out variables
    complex (kind=dp), dimension(nspin*(1+korbit)), intent (inout) :: rho2int
    complex (kind=dp), dimension(irmdnew, lmpotd, nspin*(1+korbit)), intent (inout) :: r2nefc
    complex (kind=dp), dimension(irmdnew, lmpotd, nspin*(1+korbit)), intent (inout) :: rho2nsc
    complex (kind=dp), dimension(0:lmaxd1, ielast, nqdos, nspin), intent (inout) :: den
    complex (kind=dp), dimension(lmmax0d, ielast, nqdos, nspin), intent (inout) :: denlm
    complex (kind=dp), dimension(lmmaxd, lmmaxd, ielast, nqdos), intent (inout) :: gflle
    real (kind=dp), dimension(3), intent (inout) :: denorbmom
    real (kind=dp), dimension(3), intent (inout) :: denorbmomns
    real (kind=dp), dimension(2, 3), intent (inout) :: denorbmomsp
    real (kind=dp), dimension(0:lmaxd1, 2), intent (inout) :: espv
    real (kind=dp), dimension(0:lmaxd1+1, 3), intent (inout) :: muorb !! orbital magnetic moment
    real (kind=dp), dimension(0:lmaxd, 3), intent (inout) :: denorbmomlm
    
    ! .. Local variables
    integer :: ierr, idim
    real (kind=dp), dimension(:, :, :, :), allocatable :: work
    complex (kind=dp), dimension(:, :, :, :), allocatable :: workc

    ! all with reduce instead of allreduce:
    ! complex (kind=dp) arrays
    idim = irmdnew*lmpotd*nspin*(1+korbit)
    allocate (workc(irmdnew,lmpotd,nspin*(1+korbit),1), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error allocating workc, r2nefc'
    workc = (0.0_dp, 0.0_dp)
    call mpi_reduce(r2nefc, workc(:,:,:,1), idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for r2nefc'
    call zcopy(idim, workc, 1, r2nefc, 1)
    deallocate (workc)

    idim = irmdnew*lmpotd*nspin*(1+korbit)
    allocate (workc(irmdnew,lmpotd,nspin*(1+korbit),1), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error allocating workc, rho2nsc'
    workc = (0.0_dp, 0.0_dp)
    call mpi_reduce(rho2nsc, workc, idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for rho2nsc'
    call zcopy(idim, workc, 1, rho2nsc, 1)
    deallocate (workc)

    idim = (lmaxd1+1)*ielast*nspin*nqdos
    allocate (workc(0:lmaxd1,ielast,nspin,nqdos), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error allocating workc, den'
    workc = (0.0_dp, 0.0_dp)
    call mpi_reduce(den, workc, idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for den'
    call zcopy(idim, workc, 1, den, 1)
    deallocate (workc)

    idim = lmmax0d*ielast*nspin*nqdos
    allocate (workc(lmmax0d,ielast,nspin,nqdos), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error allocating workc, denlm'
    workc = (0.0_dp, 0.0_dp)
    call mpi_reduce(denlm, workc, idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for denlm'
    call zcopy(idim, workc, 1, denlm, 1)
    deallocate (workc)

    idim = nspin*(1+korbit)
    allocate (workc(nspin*(1+korbit),1,1,1), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error allocating workc, rho2int'
    workc = (0.0_dp, 0.0_dp)
    call mpi_reduce(rho2int, workc(:,1,1,1), idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for rho2int'
    call zcopy(idim, workc, 1, rho2int, 1)
    deallocate (workc)

    idim = lmmaxd*lmmaxd*ielast*nqdos
    allocate (workc(lmmaxd,lmmaxd,ielast,nqdos), stat=ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error allocating workc, gflle'
    workc = (0.0_dp, 0.0_dp)
    call mpi_reduce(gflle, workc(:,:,:,:), idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for gflle'
    call zcopy(idim, workc, 1, gflle, 1)
    deallocate (workc)

    ! real (kind=dp) arrays
    idim = (lmaxd1+1)*2
    allocate (work(0:lmaxd1,2,1,1))
    work = 0.0_dp
    call mpi_reduce(espv, work, idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for espv'
    call dcopy(idim, work, 1, espv, 1)
    deallocate (work)

    idim = (lmaxd1+2)*3
    allocate (work(0:lmaxd1+1,3,1,1))
    work = 0.0_dp
    call mpi_reduce(muorb, work, idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for muorb'
    call dcopy(idim, work, 1, muorb, 1)
    deallocate (work)

    idim = 3
    allocate (work(3,1,1,1))
    work = 0.0_dp
    call mpi_reduce(denorbmom, work, idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for denorbmom'
    call dcopy(idim, work, 1, denorbmom, 1)
    deallocate (work)

    idim = 2*3
    allocate (work(2,3,1,1))
    work = 0.0_dp
    call mpi_reduce(denorbmomsp, work, idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for denorbmomsp'
    call dcopy(idim, work, 1, denorbmomsp, 1)
    deallocate (work)

    idim = 3
    allocate (work(3,1,1,1))
    work = 0.0_dp
    call mpi_reduce(denorbmomns, work, idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for denorbmomns'
    call dcopy(idim, work, 1, denorbmomns, 1)
    deallocate (work)

    idim = (lmaxd+1)*3
    allocate (work(0:lmaxd1,3,1,1))
    work = 0.0_dp
    call mpi_reduce(denorbmomlm, work, idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol] Error in MPI_REDUCE for denorbmomlm'
    call dcopy(idim, work, 1, denorbmomlm, 1)
    deallocate (work)

  end subroutine mympi_main1c_comm_newsosol
#endif

#ifdef CPP_MPI
  !-------------------------------------------------------------------------------
  !> Summary: MPI communication for the new solver in the `main1c` subroutine 
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> MPI communication for the new solver in the `main1c` subroutine 
  !-------------------------------------------------------------------------------
  subroutine mympi_main1c_comm_newsosol2(lmaxd1,lmmaxd,ielast,nqdos,npotd,natypd,   &
    lmpotd,irmd,mmaxd,den,denlm,muorb,espv,r2nef,rho2ns,denefat,denef,denmatn,      &
    angles_new,totmoment,bconstr,mympi_comm)

    use :: mpi
    use :: mod_datatypes, only: dp
    implicit none

    integer, intent (in) :: irmd    !! Maximum number of radial points
    integer, intent (in) :: npotd   !! (2*(KREL+KORBIT)+(1-(KREL+KORBIT))*NSPIND)*NATYP)
    integer, intent (in) :: nqdos
    integer, intent (in) :: mmaxd
    integer, intent (in) :: lmpotd  !! (lpot+1)**2
    integer, intent (in) :: natypd  !! Number of kinds of atoms in unit cell
    integer, intent (in) :: lmmaxd  !! (KREL+KORBIT+1)*(LMAX+1)**2
    integer, intent (in) :: ielast
    integer, intent (in) :: lmaxd1
    integer, intent (in) :: mympi_comm
    ! .. In/Out variables
    real (kind=dp), intent (inout) :: denef
    complex (kind=dp), dimension(0:lmaxd1, ielast, nqdos, npotd), intent (inout) :: den
    complex (kind=dp), dimension(lmmaxd, ielast, nqdos, npotd), intent (inout) :: denlm
    complex (kind=dp), dimension(mmaxd, mmaxd, 2, 2, natypd), intent (inout) :: denmatn
    real (kind=dp), dimension(natypd), intent (inout)     :: denefat
    real (kind=dp), dimension(natypd, 2), intent (inout)  :: angles_new
    real (kind=dp), dimension(natypd), intent (inout)  :: totmoment
    real (kind=dp), dimension(4, natypd), intent (inout)  :: bconstr ! MdSD: constraining fields
    real (kind=dp), dimension(0:lmaxd1, npotd), intent (inout) :: espv
    real (kind=dp), dimension(0:lmaxd1+1, 3, natypd), intent (inout) :: muorb !! orbital magnetic moment
    real (kind=dp), dimension(irmd, lmpotd, natypd, 2), intent (inout) :: r2nef
    real (kind=dp), dimension(irmd, lmpotd, natypd, 2), intent (inout) :: rho2ns !! radial density

    integer :: ierr, idim
    real (kind=dp), dimension(:,:,:,:), allocatable :: work
    complex (kind=dp), dimension(:,:,:,:), allocatable :: workc
    complex (kind=dp), dimension(:,:,:,:,:), allocatable :: workc1

    ! complex (kind=dp) arrays
    idim = (1+lmaxd1)*ielast*nqdos*npotd
    allocate (workc(0:lmaxd1,ielast,nqdos,npotd))
    workc = (0.0_dp, 0.0_dp)
    call mpi_reduce(den, workc(:,:,:,:), idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(den,workc(:,:,:,:),IDIM,MPI_DOUBLE_COMPLEX,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for den'
    call zcopy(idim, workc, 1, den, 1)
    deallocate (workc)

    idim = lmmaxd*ielast*nqdos*npotd
    allocate (workc(lmmaxd,ielast,nqdos,npotd))
    workc = (0.0_dp, 0.0_dp)
    call mpi_reduce(denlm, workc(:,:,:,:), idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(denlm,workc(:,:,:,:),IDIM,MPI_DOUBLE_COMPLEX,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for denlm'
    call zcopy(idim, workc, 1, denlm, 1)
    deallocate (workc)

    idim = mmaxd*mmaxd*npotd
    allocate (workc1(mmaxd,mmaxd,2,2,natypd))
    workc1 = (0.0_dp, 0.0_dp)
    call mpi_reduce(denmatn, workc1(:,:,:,:,:), idim, mpi_double_complex, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(denmatn,workc1(:,:,:,:,:),IDIM,MPI_DOUBLE_COMPLEX,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for denmatn'
    call zcopy(idim, workc1, 1, denmatn, 1)
    deallocate (workc1)

    ! real (kind=dp) arrays
    idim = (lmaxd1+2)*3*natypd
    allocate (work(0:lmaxd1+1,3,natypd,1))
    work = 0.0_dp
    call mpi_reduce(muorb, work(:,:,:,1), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(muorb,work(:,:,:,1),IDIM,MPI_DOUBLE_PRECISION,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for muorb'
    call dcopy(idim, work, 1, muorb, 1)
    deallocate (work)

    idim = (lmaxd1+1)*npotd
    allocate (work(0:lmaxd1,npotd,1,1))
    work = 0.0_dp
    call mpi_reduce(espv, work(:,:,1,1), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(ESPV,work(:,:,1,1),IDIM,MPI_DOUBLE_PRECISION,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for espv'
    call dcopy(idim, work, 1, espv, 1)
    deallocate (work)

    idim = irmd*lmpotd*natypd*2
    allocate (work(irmd,lmpotd,natypd,2))
    work = 0.0_dp
    call mpi_reduce(r2nef, work(:,:,:,:), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(r2nef,work(:,:,:,:),IDIM,MPI_DOUBLE_PRECISION,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for r2nef'
    call dcopy(idim, work, 1, r2nef, 1)
    deallocate (work)

    idim = irmd*lmpotd*natypd*2
    allocate (work(irmd,lmpotd,natypd,2))
    work = 0.0_dp
    call mpi_reduce(rho2ns, work(:,:,:,:), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(RHO2NS,work(:,:,:,:),IDIM,MPI_DOUBLE_PRECISION,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for rho2ns'
    call dcopy(idim, work, 1, rho2ns, 1)
    deallocate (work)

    idim = natypd
    allocate (work(natypd,1,1,1))
    work = 0.0_dp
    call mpi_reduce(denefat, work(:,1,1,1), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(denefat,work(:,1,1,1),IDIM,MPI_DOUBLE_PRECISION,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for denefat'
    call dcopy(idim, work, 1, denefat, 1)
    deallocate (work)

    idim = 1
    allocate (work(1,1,1,1))
    work = 0.0_dp
    call mpi_reduce(denef, work(:,1,1,1), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(denef,work(:,1,1,1),IDIM,MPI_DOUBLE_PRECISION,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for denef'
    call dcopy(idim, work, 1, denef, 1)
    deallocate (work)

    idim = 2*natypd
    allocate (work(2,natypd,1,1))
    work = 0.0_dp
    call mpi_reduce(angles_new, work(:,:,1,1), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    ! CALL MPI_ALLREDUCE(angles_new,work(:,:,1,1),IDIM,MPI_DOUBLE_PRECISION,MPI_SUM,mympi_comm,IERR)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for angles_new'
    call dcopy(idim, work, 1, angles_new, 1)
    deallocate (work)

    idim = natypd
    allocate (work(natypd,1,1,1))
    work = 0.0_dp
    call mpi_reduce(totmoment, work(:,1,1,1), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for totmoment'
    call dcopy(idim, work, 1, totmoment, 1)
    deallocate (work)

    ! MdSD: constraining fields
    idim = 4*natypd
    allocate (work(4,natypd,1,1))
    work = 0.0_dp
    call mpi_reduce(bconstr, work(:,:,1,1), idim, mpi_double_precision, mpi_sum, master, mympi_comm, ierr)
    if (ierr/=0) stop '[mympi_main1c_comm_newsosol2] Error in MPI_REDUCE for bconstr'
    call dcopy(idim, work, 1, bconstr, 1)
    deallocate (work)

  end subroutine mympi_main1c_comm_newsosol2
#endif


#ifdef CPP_MPI
  !-------------------------------------------------------------------------------
  !> Summary: Subroutine to check if the MPI communication is working properly
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> Subroutine to check if the MPI communication is working properly
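  !>
  !> The load imbalance per k-mesh index `ik` is estimated from the gathered timings of
  !> parts 1a (averaged over atoms) and 1b as (see the loop over energies below)
  !>
  !>   load_imbalance(ik) = sum over energies ie with kmesh_ie(ie)==ik of
  !>                          int( 10000 * (t_average(ie) + timings_1b(ie)) / t_average(ie) ) / kmesh_n(ik)
  !>
  !> i.e. a relative cost averaged over the energy points of each k-mesh, scaled by
  !> 10000 to obtain well-separated integers.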
  !-------------------------------------------------------------------------------
  subroutine check_communication_pattern(mpiatom,mpiadapt,timings_1a,timings_1b,    &
    load_imbalance,nkmesh,kmesh_ie)

    use :: mpi
    use :: mod_datatypes, only: dp

    implicit none

    integer, intent (inout) :: mpiadapt
    logical, intent (inout) :: mpiatom
    real (kind=dp), dimension(:), intent (inout) :: timings_1b
    real (kind=dp), dimension(:,:), intent (inout) :: timings_1a
    integer, dimension(:), intent (out) :: load_imbalance
    integer, intent (in) :: nkmesh
    integer, dimension(:), intent (in) :: kmesh_ie

    real (kind=dp), dimension(:), allocatable :: t_average
    real (kind=dp), dimension(:,:), allocatable :: work
    integer, dimension(:), allocatable :: kmesh_n
    integer :: nat, ne, ne_1b, ierr, ie, iat, ik, iwork

    ! find some dimensions
    nat = size(timings_1a(1,:))
    ne = size(timings_1a(:,1))
    ne_1b = size(timings_1b)
    if (ne/=ne_1b) stop '[check_communication_pattern] Error in shapes of timing arrays'

    ! communicate timing arrays
    ! timings_1a
    iwork = ne*nat
    allocate (work(ne,nat), stat=ierr)
    if (ierr/=0) stop '[check_communication_pattern] error allocating work'
    call mpi_allreduce(timings_1a, work, iwork, mpi_double_precision, mpi_sum, mpi_comm_world, ierr)
    call dcopy(iwork, work, 1, timings_1a, 1)
    deallocate (work, stat=ierr)
    if (ierr/=0) stop '[check_communication_pattern] error deallocating work'
    ! timings_1b
    iwork = ne
    allocate (work(ne,1), stat=ierr)
    if (ierr/=0) stop '[check_communication_pattern] error allocating work'
    call mpi_allreduce(timings_1b, work(:,1), iwork, mpi_double_precision, mpi_sum, mpi_comm_world, ierr)
    call dcopy(iwork, work(:,1), 1, timings_1b, 1)
    deallocate (work, stat=ierr)
    if (ierr/=0) stop '[check_communication_pattern] error deallocating work'

    ! first find average over atoms of timings of 1a
    allocate (t_average(ne), stat=ierr)
    if (ierr/=0) stop '[check_communication_pattern] Error allocating t_average'

    do ie = 1, ne
      t_average(ie) = 0.0_dp
      do iat = 1, nat
        t_average(ie) = t_average(ie) + timings_1a(ie, iat)/real(nat, kind=dp)
      end do
      ! if(myrank==master) write(1337,'(A,i9,2ES23.16)') '[check_communication_pattern]: ie, time for 1a and 1b', ie, timings_1b(ie),t_average(ie)
      ! if(myrank==master .and. t_inc%i_write) write(1337,'(A,i9,2ES23.16)') '[check_communication_pattern]: ie, time for 1a and 1b', ie, timings_1b(ie),t_average(ie)
    end do

    ! find how many different energy points have the same kmesh
    allocate (kmesh_n(nkmesh), stat=ierr)
    if (ierr/=0) stop '[check_communication_pattern] Error allocating kmesh_n'
    kmesh_n(:) = 0
    ik = 1
    do ie = 1, ne
      ik = kmesh_ie(ie)
      kmesh_n(ik) = kmesh_n(ik) + 1
    end do

    ! average timings of energypoints in different kmesh
    load_imbalance(:) = 0
    do ie = 1, ne
      ik = kmesh_ie(ie)            ! multiply with large number to have nice distinguishable integers
      load_imbalance(ik) = load_imbalance(ik) + (int(10000.0_dp*((t_average(ie)+timings_1b(ie))/t_average(ie))))/kmesh_n(ik)
    end do
    if (myrank==master) write (*, *) 'load_imbalance', load_imbalance
    ! if(myrank==master .and. t_inc%i_write) write(1337,'(A,i9,1000I9)') '[check_communication_pattern] load imbalance:', load_imbalance

    ! set MPIatom and MPIadapt accordingly
    ! ...rest_at, rest_e => which fits actual load imbalance better => set MPIatom and MPIadapt
    if (myrank==master) write (*, *) mpiatom, mpiadapt

    ! finally deallocate work arrays
    deallocate (t_average, kmesh_n, stat=ierr)
    if (ierr/=0) stop '[check_communication_pattern] Error deallocating work arrays'

  end subroutine check_communication_pattern
#endif

  !-------------------------------------------------------------------------------
  !> Summary: Wrapper for work distribution over energies, also saves parallelization information for later use in `t_mpi_c_grid`
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> Wrapper for work distribution over energies, also saves parallelization
  !> information for later use in `t_mpi_c_grid`
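  !>
  !> Example (a minimal usage sketch; `ielast` denotes the number of energy points):
  !>
  !>   call distribute_work_energies(ielast, distribute_rest=.true.)
  !>   ! afterwards t_mpi_c_grid%ntot_pT2/ioff_pT2 hold the number of energies and the
  !>   ! offset per energy rank, and t_mpi_c_grid%ntot2 the share of this rank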
  !-------------------------------------------------------------------------------
  subroutine distribute_work_energies(n_work, distribute_rest)

    use :: mod_types, only: t_mpi_c_grid
    use :: mod_profiling, only: memocc

    implicit none

    !! number of energies to be distributed
    integer, intent (in) :: n_work
    !! decide whether or not to distribute the rest ranks or leave them idle
    logical, optional, intent (in) :: distribute_rest
    ! locals
    integer, dimension (0:nranks-1) :: ntot_pt, ioff_pt
    integer :: i_stat

#ifdef CPP_MPI
    if (present(distribute_rest)) then
      call distribute_linear_on_tasks(t_mpi_c_grid%nranks_at, t_mpi_c_grid%myrank_ie+t_mpi_c_grid%myrank_at, master, n_work, ntot_pt, ioff_pt, .true., distribute_rest)
    else
      call distribute_linear_on_tasks(t_mpi_c_grid%nranks_at, t_mpi_c_grid%myrank_ie+t_mpi_c_grid%myrank_at, master, n_work, ntot_pt, ioff_pt, .true., .true.)
    end if

    ! store in t_mpi_c_grid for later use
    if (.not. (allocated(t_mpi_c_grid%ntot_pt2) .or. allocated(t_mpi_c_grid%ioff_pt2))) then
      allocate (t_mpi_c_grid%ntot_pt2(0:t_mpi_c_grid%nranks_at-1), stat=i_stat)
      call memocc(i_stat, product(shape(t_mpi_c_grid%ntot_pt2))*kind(t_mpi_c_grid%ntot_pt2), 't_mpi_c_grid%ntot_pT2', 'distribute_work_energies')
      allocate (t_mpi_c_grid%ioff_pt2(0:t_mpi_c_grid%nranks_at-1), stat=i_stat)
      call memocc(i_stat, product(shape(t_mpi_c_grid%ioff_pt2))*kind(t_mpi_c_grid%ioff_pt2), 't_mpi_c_grid%ioff_pT2', 'distribute_work_energies')
    end if
    t_mpi_c_grid%ntot_pt2 = ntot_pt
    t_mpi_c_grid%ioff_pt2 = ioff_pt
    t_mpi_c_grid%ntot2 = t_mpi_c_grid%ntot_pt2(t_mpi_c_grid%myrank_at)
#else
    if (.not. (allocated(t_mpi_c_grid%ntot_pt2) .or. allocated(t_mpi_c_grid%ioff_pt2))) then
      allocate (t_mpi_c_grid%ntot_pt2(1), stat=i_stat)
      call memocc(i_stat, product(shape(t_mpi_c_grid%ntot_pt2))*kind(t_mpi_c_grid%ntot_pt2), 't_mpi_c_grid%ntot_pT2', 'distribute_work_energies')
      allocate (t_mpi_c_grid%ioff_pt2(1), stat=i_stat)
      call memocc(i_stat, product(shape(t_mpi_c_grid%ioff_pt2))*kind(t_mpi_c_grid%ioff_pt2), 't_mpi_c_grid%ioff_pT2', 'distribute_work_energies')
    end if
    t_mpi_c_grid%ntot2 = n_work
    t_mpi_c_grid%ntot_pt2 = n_work
    t_mpi_c_grid%ioff_pt2 = 0
#endif

  end subroutine distribute_work_energies


  !-------------------------------------------------------------------------------
  !> Summary: wrapper for work distribution over atoms, also saves parallelization information for later use in `t_mpi_c_grid`
  !> Author: 
  !> Category: communication, KKRhost 
  !> Deprecated: False
  !> wrapper for work distribution over atoms, also saves parallelization
  !> information for later use in `t_mpi_c_grid`
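  !>
  !> Example (a minimal usage sketch; `natyp` denotes the number of atoms):
  !>
  !>   call distribute_work_atoms(natyp, i1_start, i1_end, distribute_rest=.true.)
  !>   do i1 = i1_start, i1_end   ! loop over the atoms assigned to this rank
  !>     ...
  !>   end do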
  !-------------------------------------------------------------------------------
  subroutine distribute_work_atoms(n_work, i1_start, i1_end, distribute_rest)

    use :: mod_types, only: t_mpi_c_grid
    use :: mod_profiling, only: memocc

    implicit none

    !! number of atoms to be distributed
    integer, intent (in) :: n_work
    !! decide whether or not to distribute the rest ranks or leave them idle
    logical, optional, intent (in) :: distribute_rest
    !! index of the first atom treated by this rank
    integer, intent (out) :: i1_start
    !! index of the last atom treated by this rank
    integer, intent (out) :: i1_end
    ! locals
    integer, dimension (0:nranks-1) :: ntot_pt, ioff_pt
    integer :: i_stat

#ifdef CPP_MPI
    if (present(distribute_rest)) then
      call distribute_linear_on_tasks(t_mpi_c_grid%nranks_ie, t_mpi_c_grid%myrank_ie+t_mpi_c_grid%myrank_at, master, n_work, ntot_pt, ioff_pt, .true., distribute_rest)
    else
      call distribute_linear_on_tasks(t_mpi_c_grid%nranks_ie, t_mpi_c_grid%myrank_ie+t_mpi_c_grid%myrank_at, master, n_work, ntot_pt, ioff_pt, .true., .true.)
    end if

    i1_start = ioff_pt(t_mpi_c_grid%myrank_ie) + 1
    i1_end = ioff_pt(t_mpi_c_grid%myrank_ie) + ntot_pt(t_mpi_c_grid%myrank_ie)

    ! store in t_mpi_c_grid for later use
    t_mpi_c_grid%ntot1 = ntot_pt(t_mpi_c_grid%myrank_ie)

    if (.not. (allocated(t_mpi_c_grid%ntot_pt1) .and. allocated(t_mpi_c_grid%ioff_pt1))) then
      allocate (t_mpi_c_grid%ntot_pt1(0:t_mpi_c_grid%nranks_ie-1), stat=i_stat)
      call memocc(i_stat, product(shape(t_mpi_c_grid%ntot_pt1))*kind(t_mpi_c_grid%ntot_pt1), 't_mpi_c_grid%ntot_pT1', 'distribute_work_atoms')
      allocate (t_mpi_c_grid%ioff_pt1(0:t_mpi_c_grid%nranks_ie-1), stat=i_stat)
      call memocc(i_stat, product(shape(t_mpi_c_grid%ioff_pt1))*kind(t_mpi_c_grid%ioff_pt1), 't_mpi_c_grid%ioff_pT1', 'distribute_work_atoms')
    end if
    t_mpi_c_grid%ntot_pt1 = ntot_pt
    t_mpi_c_grid%ioff_pt1 = ioff_pt
#else
    if (.not. (allocated(t_mpi_c_grid%ntot_pt1) .or. allocated(t_mpi_c_grid%ioff_pt1))) then
      allocate (t_mpi_c_grid%ntot_pt1(1), stat=i_stat)
      call memocc(i_stat, product(shape(t_mpi_c_grid%ntot_pt1))*kind(t_mpi_c_grid%ntot_pt1), 't_mpi_c_grid%ntot_pT1', 'distribute_work_atoms')
      allocate (t_mpi_c_grid%ioff_pt1(1), stat=i_stat)
      call memocc(i_stat, product(shape(t_mpi_c_grid%ioff_pt1))*kind(t_mpi_c_grid%ioff_pt1), 't_mpi_c_grid%ioff_pT1', 'distribute_work_atoms')
    end if
    t_mpi_c_grid%ntot1 = n_work
    t_mpi_c_grid%ntot_pt1 = n_work
    t_mpi_c_grid%ioff_pt1 = 0

    i1_start = 1
    i1_end = n_work
#endif

  end subroutine distribute_work_atoms

#ifdef CPP_MPI
  !-------------------------------------------------------------------------------
  !> Summary: MPI broadcast of global variables
  !> Author: Jonathan Chico, Philipp Rüßmann
  !> Category: KKRhost, communication, initialization
  !> Deprecated: False ! This needs to be set to True for deprecated subroutines
  !>
  !> MPI broadcast routine for global variables (i.e. array dimensions etc.)
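  !>
  !> The routine packs all scalars into a single derived MPI datatype and broadcasts
  !> them in one call; a minimal sketch of the same pattern for two hypothetical
  !> integer variables `a` and `b`:
  !>
  !>   integer (kind=mpi_address_kind) :: disp(2)
  !>   call mpi_get_address(a, disp(1), ierr)
  !>   call mpi_get_address(b, disp(2), ierr)
  !>   disp = disp - disp(1)                             ! displacements relative to the first variable
  !>   call mpi_type_create_struct(2, (/1,1/), disp, (/mpi_integer,mpi_integer/), newtype, ierr)
  !>   call mpi_type_commit(newtype, ierr)
  !>   call mpi_bcast(a, 1, newtype, master, mpi_comm_world, ierr)   ! a is the base address
  !>   call mpi_type_free(newtype, ierr)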
  !-------------------------------------------------------------------------------
  subroutine bcast_global_variables()
    use :: mpi
    use :: global_variables ! use all parameters
    implicit none
  
    integer :: n !! number of parameters that are broadcasted
    integer, allocatable :: blocklen1(:), etype1(:) !! blocklengths of variables in derived data type and list of MPI datatypes
    integer :: ierr !! error status
    integer :: mympitype1 !! derived data type for collective communication
    integer (kind=mpi_address_kind), allocatable :: disp1(:) !! MPI addresses
    integer (kind=mpi_address_kind) :: base !! base address of first entry
  
  
    n = 66
    allocate (blocklen1(n), etype1(n), disp1(n), stat=ierr)
    if (ierr/=0) stop 'error allocating arrays in bcast_global_variables'
  
    call mpi_get_address(n, disp1(1), ierr)
    call mpi_get_address(irid, disp1(2), ierr)
    call mpi_get_address(krel, disp1(3), ierr)
    call mpi_get_address(nfund, disp1(4), ierr)
    call mpi_get_address(ipand, disp1(5), ierr)
    call mpi_get_address(ngshd, disp1(6), ierr)
    call mpi_get_address(ncleb, disp1(7), ierr)
    call mpi_get_address(knoco, disp1(8), ierr)
    call mpi_get_address(iemxd, disp1(9), ierr)
    call mpi_get_address(irnsd, disp1(10), ierr)
    call mpi_get_address(nmaxd, disp1(11), ierr)
    call mpi_get_address(ishld, disp1(12), ierr)
    call mpi_get_address(naclsd, disp1(13), ierr)
    call mpi_get_address(nspotd, disp1(14), ierr)
    call mpi_get_address(ntperd, disp1(15), ierr)
    call mpi_get_address(ntrefd, disp1(16), ierr)
    call mpi_get_address(nsheld, disp1(17), ierr)
    call mpi_get_address(ncelld, disp1(18), ierr)
    call mpi_get_address(nspind, disp1(19), ierr)
    call mpi_get_address(knosph, disp1(20), ierr)
    call mpi_get_address(korbit, disp1(21), ierr)
    call mpi_get_address(kpoibz, disp1(22), ierr)
    call mpi_get_address(wlength, disp1(23), ierr)
    call mpi_get_address(nprincd, disp1(24), ierr)
    call mpi_get_address(nlayerd, disp1(25), ierr)
    call mpi_get_address(natomimpd, disp1(26), ierr)
    call mpi_get_address(lmaxd, disp1(27), ierr)
    call mpi_get_address(lmmaxd, disp1(28), ierr)
    call mpi_get_address(lmgf0d, disp1(29), ierr)
    call mpi_get_address(alm, disp1(30), ierr)
    call mpi_get_address(almgf0, disp1(31), ierr)
    call mpi_get_address(ndim_slabinv, disp1(32), ierr)
    call mpi_get_address(nembd, disp1(33), ierr)
    call mpi_get_address(nembd1, disp1(34), ierr)
    call mpi_get_address(nembd2, disp1(35), ierr)
    call mpi_get_address(nrd, disp1(36), ierr)
    call mpi_get_address(lm2d, disp1(37), ierr)
    call mpi_get_address(nclsd, disp1(38), ierr)
    call mpi_get_address(mmaxd, disp1(39), ierr)
    call mpi_get_address(npotd, disp1(40), ierr)
    call mpi_get_address(lmxspd, disp1(41), ierr)
    call mpi_get_address(lassld, disp1(42), ierr)
    call mpi_get_address(irmind, disp1(43), ierr)
    call mpi_get_address(nofgij, disp1(44), ierr)
    call mpi_get_address(nspindd, disp1(45), ierr)
    call mpi_get_address(nsatypd, disp1(46), ierr)
    call mpi_get_address(nrefd, disp1(47), ierr)
    call mpi_get_address(irmd, disp1(48), ierr)
    call mpi_get_address(naezd, disp1(49), ierr)
    call mpi_get_address(natypd, disp1(50), ierr)
    call mpi_get_address(lmpotd, disp1(51), ierr)
    call mpi_get_address(ntotd, disp1(52), ierr)
    call mpi_get_address(nrmaxd, disp1(53), ierr)
    call mpi_get_address(lpotd, disp1(54), ierr)
    call mpi_get_address(nchebd, disp1(55), ierr)
    call mpi_get_address(maxmshd, disp1(56), ierr)
    call mpi_get_address(kBdG, disp1(57), ierr)
    call mpi_get_address(ninit_broydenspin, disp1(58), ierr)
    call mpi_get_address(memlen_broydenspin, disp1(59), ierr)
    call mpi_get_address(nsimplemixfirst, disp1(60), ierr)
    call mpi_get_address(linterface, disp1(61), ierr)
    call mpi_get_address(lnc, disp1(62), ierr)
    call mpi_get_address(pot_ns_cutoff, disp1(63), ierr)
    call mpi_get_address(mixfac_broydenspin, disp1(64), ierr)
    call mpi_get_address(qbound_spin, disp1(65), ierr)
    call mpi_get_address(angles_cutoff, disp1(66), ierr)
  
    ! find displacements of variables
    base = disp1(1)
    disp1 = disp1 - base
  
    ! set length of variables in derived data type
    blocklen1(1:n) = 1
  
    ! set datatype of variables: entries 1..n-6 are integers, n-5..n-4 (linterface, lnc)
    ! are logicals and n-3..n (pot_ns_cutoff, ..., angles_cutoff) are reals
    etype1(1:n-6) = mpi_integer
    etype1(n-5:n-4) = mpi_logical
    etype1(n-3:n) = mpi_double_precision
  
    ! create new Type structure for derived data type
    call mpi_type_create_struct(n, blocklen1, disp1, etype1, mympitype1, ierr)
    if (ierr/=mpi_success) stop 'Error in MPI_TYPE_CREATE_STRUCT in bcast_global_variables'
  
    ! commit new type
    call mpi_type_commit(mympitype1, ierr)
    if (ierr/=mpi_success) stop 'Error committing MPI datatype in bcast_global_variables'
  
    ! broadcast derived data type
  
    call mpi_bcast(n, 1, mympitype1, master, mpi_comm_world, ierr)
    if (ierr/=mpi_success) stop 'Error broadcasting global variables'
  
    ! finally free auxiliary type and deallocate working arrays
    call mpi_type_free(mympitype1, ierr)
    deallocate (blocklen1, etype1, disp1, stat=ierr)
    if (ierr/=0) stop 'error deallocating arrays in bcast_global_variables'

  end subroutine bcast_global_variables
#endif


end module mod_mympi