diff --git a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp
index 4783359c9..a7cb6d6bb 100644
--- a/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp
+++ b/include/El/blas_like/level1/Copy/TranslateBetweenGrids.hpp
@@ -3856,7 +3856,8 @@ void TranslateBetweenGrids(
     EL_DEBUG_CSE;

     /* Overview
-
+       We broadcast the size of A to all ranks in B to make sure that
+       all ranks in the B subgrid have the correct size of A.
       Since we are using blocking communication, some care is required
       to avoid deadlocks. Let's start with a naive algorithm for
       [STAR,VC] matrices and optimize it in steps:
@@ -3883,21 +3884,53 @@ void TranslateBetweenGrids(
     */

     // Matrix dimensions
-    const Int m = A.Height();
-    const Int n = A.Width();
+    Int m = A.Height();
+    Int n = A.Width();
+    Int strideA = A.RowStride();
+    Int ALDim = A.LDim();
+
+    // Create A metadata
+    Int recvMetaData[4];
+    Int metaData[4];
+
+    SyncInfo<Device::CPU> syncGeneralMetaData = SyncInfo<Device::CPU>();
+    mpi::Comm const& viewingCommB = B.Grid().ViewingComm();
+
+    const bool inAGrid = A.Participating();
+    const bool inBGrid = B.Participating();
+
+    if(inAGrid)
+    {
+        metaData[0] = m;
+        metaData[1] = n;
+        metaData[2] = strideA;
+        metaData[3] = ALDim;
+    }
+    else
+    {
+        metaData[0] = 0;
+        metaData[1] = 0;
+        metaData[2] = 0;
+        metaData[3] = 0;
+    }
+    const std::vector<Int> sendMetaData(metaData, metaData + 4);
+    mpi::AllReduce(sendMetaData.data(), recvMetaData, 4, mpi::MAX, viewingCommB, syncGeneralMetaData);
+    m = recvMetaData[0];
+    n = recvMetaData[1];
+    strideA = recvMetaData[2];
+    ALDim = recvMetaData[3];
+
+    B.Resize(m, n);
     const Int nLocA = A.LocalWidth();
     const Int nLocB = B.LocalWidth();

     // Return immediately if there is no local data
-    const bool inAGrid = A.Participating();
-    const bool inBGrid = B.Participating();
     if (!inAGrid && !inBGrid)
     {
         return;
     }

     // Compute the number of messages to send/recv
-    const Int strideA = A.RowStride();
     const Int strideB = B.RowStride();
     const Int strideGCD = GCD(strideA, strideB);
     const Int numSends = Min(strideB/strideGCD, nLocA);
@@ -3913,7 +3946,6 @@ void TranslateBetweenGrids(
     // that we can match send/recv communicators. Since A's VC
     // communicator is not necessarily defined on every process, we
     // instead work with A's owning group.
-    mpi::Comm const& viewingCommB = B.Grid().ViewingComm();
     mpi::Group owningGroupA = A.Grid().OwningGroup();
     const int sizeA = A.Grid().Size();
     vector<int> viewingRanksA(sizeA), owningRanksA(sizeA);
@@ -3976,7 +4008,7 @@ void TranslateBetweenGrids(
             // Copy data locally
             copy::util::InterleaveMatrix(
                 m, messageWidth,
-                A.LockedBuffer(0,jLocA), 1, numSends*A.LDim(),
+                A.LockedBuffer(0,jLocA), 1, numSends*ALDim,
                 B.Buffer(0,jLocB), 1, numRecvs*B.LDim(),
                 syncInfo);
         }
@@ -3984,7 +4016,7 @@ void TranslateBetweenGrids(
             // Send data to other rank
             copy::util::InterleaveMatrix(
                 m, messageWidth,
-                A.LockedBuffer(0,jLocA), 1, numSends*A.LDim(),
+                A.LockedBuffer(0,jLocA), 1, numSends*ALDim,
                 messageBuf.data(), 1, m,
                 syncInfo);
             mpi::Send(
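
Note (illustration only, not part of the patch): the metadata exchange added in the second hunk works because ranks outside A's grid contribute zeros, so a MAX all-reduce over the shared viewing communicator reproduces A's true dimensions on every rank, after which B can be resized consistently everywhere. The standalone sketch below shows the same pattern with plain MPI rather than Hydrogen's mpi:: wrappers; the example dimensions and the even-rank participation rule are made up.

#include <mpi.h>
#include <array>
#include <cstdio>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Hypothetical participation rule: only even ranks belong to the A grid
    // and therefore know A's true dimensions.
    const bool inAGrid = (rank % 2 == 0);

    std::array<int, 4> metaData = {0, 0, 0, 0};   // {m, n, strideA, ALDim}
    if (inAGrid)
    {
        metaData[0] = 1000;  // m       (made-up example value)
        metaData[1] = 500;   // n
        metaData[2] = 4;     // strideA
        metaData[3] = 1000;  // ALDim
    }

    // MAX all-reduce: the zeros contributed by non-participating ranks never
    // win, so every rank ends up with A's actual metadata.
    std::array<int, 4> recvMetaData = {0, 0, 0, 0};
    MPI_Allreduce(metaData.data(), recvMetaData.data(), 4, MPI_INT,
                  MPI_MAX, MPI_COMM_WORLD);

    // At this point every rank can size its copy of B consistently,
    // analogous to B.Resize(recvMetaData[0], recvMetaData[1]) in the patch.
    std::printf("rank %d sees m=%d n=%d strideA=%d ALDim=%d\n", rank,
                recvMetaData[0], recvMetaData[1], recvMetaData[2],
                recvMetaData[3]);

    MPI_Finalize();
    return 0;
}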