Skip to content

Commit

Permalink
prov/efa: update efa shm implementation to allocate fi_peer_srx_context
Browse files Browse the repository at this point in the history
The previous definition of the peer API didn't specify who allocated the
second peer structure (the one referenced by the peer). The shm implementation
was choosing to duplicate the imported srx and set it internally. The new
definition specifies that the owner handle the duplication of the peer resource
which is then imported into the peer to just set. Shm has been updated accordingly
but efa needs to be updated to create a second peer_srx and set the fields to the
original one for the peer to reference the owner_ops correctly.

This also adds a missing fi_close for the shm srx resource

Signed-off-by: Alexia Ingerson <[email protected]>
Signed-off-by: Shi Jin <[email protected]>
  • Loading branch information
aingerson authored and shijin-aws committed Oct 11, 2024
1 parent 472dc49 commit 527d320
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 22 deletions.
4 changes: 4 additions & 0 deletions prov/efa/src/rdm/efa_rdm_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ struct efa_rdm_ep {

/* shm provider fid */
struct fid_ep *shm_ep;
/* shm srx fid (shm-owned) */
struct fid_ep *shm_srx;
/* shm peer_srx (efa-owned) */
struct fid_peer_srx *shm_peer_srx;

size_t mtu_size;
size_t max_msg_size; /**< #FI_OPT_MAX_MSG_SIZE */
Expand Down
82 changes: 60 additions & 22 deletions prov/efa/src/rdm/efa_rdm_ep_fiops.c
Original file line number Diff line number Diff line change
Expand Up @@ -941,6 +941,42 @@ void efa_rdm_ep_remove_cq_ibv_cq_poll_list(struct efa_rdm_ep *ep)
}
}

/**
* @brief Clean efa_rdm_ep's shm ep level resources as the best effort
*
* @param efa_rdm_ep pointer to efa rdm ep
* @return int FI_SUCCESS on success, negative integer on failure
*/
static int efa_rdm_ep_close_shm_ep_resources(struct efa_rdm_ep *efa_rdm_ep)
{
int ret, retv = 0;

if (efa_rdm_ep->shm_srx) {
ret = fi_close(&efa_rdm_ep->shm_srx->fid);
if (ret) {
EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm srx\n");
retv = ret;
}
efa_rdm_ep->shm_srx = NULL;
}

if (efa_rdm_ep->shm_peer_srx) {
free(efa_rdm_ep->shm_peer_srx);
efa_rdm_ep->shm_peer_srx = NULL;
}

if (efa_rdm_ep->shm_ep) {
ret = fi_close(&efa_rdm_ep->shm_ep->fid);
if (ret) {
EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm ep\n");
retv = ret;
}
efa_rdm_ep->shm_ep = NULL;
}

return retv;
}

/**
* @brief implement the fi_close() API for the EFA RDM endpoint
* @param[in,out] fid Endpoint to close
Expand Down Expand Up @@ -981,13 +1017,9 @@ static int efa_rdm_ep_close(struct fid *fid)
retv = ret;
}

if (efa_rdm_ep->shm_ep) {
ret = fi_close(&efa_rdm_ep->shm_ep->fid);
if (ret) {
EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm EP\n");
retv = ret;
}
}
ret = efa_rdm_ep_close_shm_ep_resources(efa_rdm_ep);
if (ret)
retv = ret;

efa_rdm_ep_destroy_buffer_pools(efa_rdm_ep);

Expand Down Expand Up @@ -1053,12 +1085,8 @@ static void efa_rdm_ep_close_shm_resources(struct efa_rdm_ep *efa_rdm_ep)
struct efa_av *efa_av;
struct efa_rdm_cq *efa_rdm_cq;

if (efa_rdm_ep->shm_ep) {
ret = fi_close(&efa_rdm_ep->shm_ep->fid);
if (ret)
EFA_WARN(FI_LOG_EP_CTRL, "Unable to close shm ep\n");
efa_rdm_ep->shm_ep = NULL;
}

(void) efa_rdm_ep_close_shm_ep_resources(efa_rdm_ep);

efa_av = efa_rdm_ep->base_ep.av;
if (efa_av->shm_rdm_av) {
Expand Down Expand Up @@ -1230,7 +1258,6 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg)
int ret = 0;
struct fi_peer_srx_context peer_srx_context = {0};
struct fi_rx_attr peer_srx_attr = {0};
struct fid_ep *peer_srx_ep = NULL;
struct util_srx_ctx *srx_ctx;

switch (command) {
Expand Down Expand Up @@ -1296,26 +1323,36 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg)
* shared memory region.
*/
if (ep->shm_ep) {
peer_srx_context.srx = util_get_peer_srx(ep->peer_srx_ep);
ep->shm_peer_srx = calloc(1, sizeof(*ep->shm_peer_srx));
if (!ep->shm_peer_srx) {
ret = -FI_ENOMEM;
goto err_close_shm;
}
memcpy(ep->shm_peer_srx, util_get_peer_srx(ep->peer_srx_ep),
sizeof(*ep->shm_peer_srx));

peer_srx_context.size = sizeof(peer_srx_context);
peer_srx_context.srx = ep->shm_peer_srx;

peer_srx_attr.op_flags |= FI_PEER;
ret = fi_srx_context(efa_rdm_ep_domain(ep)->shm_domain,
&peer_srx_attr, &peer_srx_ep, &peer_srx_context);
&peer_srx_attr, &ep->shm_srx, &peer_srx_context);
if (ret)
goto err_unlock;
goto err_close_shm;
shm_ep_name_len = EFA_SHM_NAME_MAX;
ret = efa_shm_ep_name_construct(shm_ep_name, &shm_ep_name_len, &ep->base_ep.src_addr);
if (ret < 0)
goto err_unlock;
goto err_close_shm;
fi_setname(&ep->shm_ep->fid, shm_ep_name, shm_ep_name_len);

/* Bind srx to shm ep */
ret = fi_ep_bind(ep->shm_ep, &ep->peer_srx_ep->fid, 0);
ret = fi_ep_bind(ep->shm_ep, &ep->shm_srx->fid, 0);
if (ret)
goto err_unlock;
goto err_close_shm;

ret = fi_enable(ep->shm_ep);
if (ret)
goto err_unlock;
goto err_close_shm;
}
ofi_genlock_unlock(srx_ctx->lock);
break;
Expand All @@ -1326,7 +1363,8 @@ static int efa_rdm_ep_ctrl(struct fid *fid, int command, void *arg)

return ret;

err_unlock:
err_close_shm:
efa_rdm_ep_close_shm_ep_resources(ep);
ofi_genlock_unlock(srx_ctx->lock);
err_destroy_qp:
efa_base_ep_destruct_qp(&ep->base_ep);
Expand Down

0 comments on commit 527d320

Please sign in to comment.