From 62dbef69147bce25d3269871e37f19c8d9d9d561 Mon Sep 17 00:00:00 2001 From: Shi Jin Date: Tue, 26 Nov 2024 06:49:49 +0000 Subject: [PATCH] [v2.0.x] prov/efa: Skip rx pkt refill under certain threshold Libfabric currently refills the rx pkt pool in every cq read when there are >0 pkts to post, which means it may post ibv_recv 1-by-1 if there is only 1 pkt to post per cq read. Such 1-by-1 posting is less performant than a single batch post. This patch improves this strategy by introducing a threshold for the refilling. When the number of internal rx pkts to post is lower than this threshold, the refill will be skipped. Also introduce FI_EFA_INTERNAL_RX_REFILL_THRESHOLD, which allows tuning this threshold. Signed-off-by: Shi Jin (cherry picked from commit a149f51938a7fa65f05e00da3ce46358ccab91c0) --- man/fi_efa.7.md | 5 ++ prov/efa/src/efa_env.c | 4 ++ prov/efa/src/efa_env.h | 6 +++ prov/efa/src/rdm/efa_rdm_ep.h | 2 + prov/efa/src/rdm/efa_rdm_ep_utils.c | 6 ++- prov/efa/test/efa_unit_test_ep.c | 80 +++++++++++++++++++++++++++++ prov/efa/test/efa_unit_tests.c | 2 + prov/efa/test/efa_unit_tests.h | 2 + 8 files changed, 106 insertions(+), 1 deletion(-) diff --git a/man/fi_efa.7.md b/man/fi_efa.7.md index b6eefc19dc1..077f93c5515 100644 --- a/man/fi_efa.7.md +++ b/man/fi_efa.7.md @@ -338,6 +338,11 @@ for details. : Use device's unsolicited write recv functionality when it's available. (Default: 1). Setting this environment variable to 0 can disable this feature. +*FI_EFA_INTERNAL_RX_REFILL_THRESHOLD* +: The threshold that EFA provider will refill the internal rx pkt pool. (Default: 8). +When the number of internal rx pkts to post is lower than this threshold, +the refill will be skipped. 
+ # SEE ALSO [`fabric`(7)](fabric.7.html), diff --git a/prov/efa/src/efa_env.c b/prov/efa/src/efa_env.c index 79a315c7cbe..ef6eedd57ec 100644 --- a/prov/efa/src/efa_env.c +++ b/prov/efa/src/efa_env.c @@ -39,6 +39,7 @@ struct efa_env efa_env = { .use_sm2 = false, .huge_page_setting = EFA_ENV_HUGE_PAGE_UNSPEC, .use_unsolicited_write_recv = 1, + .internal_rx_refill_threshold = 8, }; /** @@ -132,6 +133,7 @@ void efa_env_param_get(void) &efa_mr_max_cached_size); fi_param_get_size_t(&efa_prov, "tx_size", &efa_env.tx_size); fi_param_get_size_t(&efa_prov, "rx_size", &efa_env.rx_size); + fi_param_get_size_t(&efa_prov, "internal_rx_refill_threshold", &efa_env.internal_rx_refill_threshold); fi_param_get_bool(&efa_prov, "rx_copy_unexp", &efa_env.rx_copy_unexp); fi_param_get_bool(&efa_prov, "rx_copy_ooo", @@ -232,6 +234,8 @@ void efa_env_define() "will use huge page unless FI_EFA_FORK_SAFE is set to 1/on/true."); fi_param_define(&efa_prov, "use_unsolicited_write_recv", FI_PARAM_BOOL, "Use device's unsolicited write recv functionality when it's available. (Default: true)"); + fi_param_define(&efa_prov, "internal_rx_refill_threshold", FI_PARAM_SIZE_T, + "The threshold that EFA provider will refill the internal rx pkt pool. (Default: %zu)", efa_env.internal_rx_refill_threshold); } diff --git a/prov/efa/src/efa_env.h b/prov/efa/src/efa_env.h index 6fdd83a4a21..dbff4182292 100644 --- a/prov/efa/src/efa_env.h +++ b/prov/efa/src/efa_env.h @@ -79,6 +79,12 @@ struct efa_env { int use_sm2; enum efa_env_huge_page_setting huge_page_setting; int use_unsolicited_write_recv; + /** + * The threshold that EFA provider will refill the internal rx pkt pool. + * When the number of internal rx pkts to post is lower than this threshold, + * the refill will be skipped. 
+ */ + size_t internal_rx_refill_threshold; }; extern struct efa_env efa_env; diff --git a/prov/efa/src/rdm/efa_rdm_ep.h b/prov/efa/src/rdm/efa_rdm_ep.h index d7a8fc5ddc2..b82741963ef 100644 --- a/prov/efa/src/rdm/efa_rdm_ep.h +++ b/prov/efa/src/rdm/efa_rdm_ep.h @@ -263,6 +263,8 @@ struct efa_domain *efa_rdm_ep_domain(struct efa_rdm_ep *ep) void efa_rdm_ep_post_internal_rx_pkts(struct efa_rdm_ep *ep); +int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep); + /** * @brief return whether this endpoint should write error cq entry for RNR. * diff --git a/prov/efa/src/rdm/efa_rdm_ep_utils.c b/prov/efa/src/rdm/efa_rdm_ep_utils.c index 83d66a23991..12c3c519983 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_utils.c +++ b/prov/efa/src/rdm/efa_rdm_ep_utils.c @@ -741,7 +741,11 @@ int efa_rdm_ep_bulk_post_internal_rx_pkts(struct efa_rdm_ep *ep) { int i, err; - if (ep->efa_rx_pkts_to_post == 0) + /** + * When efa_env.internal_rx_refill_threshold > efa_rdm_ep_get_rx_pool_size(ep), + * we should always refill when the pool is empty. 
+ */ + if (ep->efa_rx_pkts_to_post < MIN(efa_env.internal_rx_refill_threshold, efa_rdm_ep_get_rx_pool_size(ep))) return 0; assert(ep->efa_rx_pkts_to_post + ep->efa_rx_pkts_posted <= ep->efa_max_outstanding_rx_ops); diff --git a/prov/efa/test/efa_unit_test_ep.c b/prov/efa/test/efa_unit_test_ep.c index 375ada94683..1ac044ce00c 100644 --- a/prov/efa/test/efa_unit_test_ep.c +++ b/prov/efa/test/efa_unit_test_ep.c @@ -1219,3 +1219,83 @@ void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(struct efa_res free(pkt_entry_vec); } + +static +void test_efa_rdm_ep_rx_refill_impl(struct efa_resource **state, int threshold, int rx_size) +{ + struct efa_resource *resource = *state; + struct efa_rdm_ep *efa_rdm_ep; + struct efa_rdm_pke *pkt_entry; + int i; + size_t threshold_orig; + + if (threshold < 4 || rx_size < 4) { + fprintf(stderr, "Too small threshold or rx_size for this test\n"); + fail(); + } + + threshold_orig = efa_env.internal_rx_refill_threshold; + + efa_env.internal_rx_refill_threshold = threshold; + + resource->hints = efa_unit_test_alloc_hints(FI_EP_RDM); + assert_non_null(resource->hints); + resource->hints->rx_attr->size = rx_size; + efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM, FI_VERSION(1, 14), + resource->hints, true, true); + + efa_rdm_ep = container_of(resource->ep, struct efa_rdm_ep, base_ep.util_ep.ep_fid); + assert_int_equal(efa_rdm_ep_get_rx_pool_size(efa_rdm_ep), rx_size); + + /* Grow the rx pool and post rx pkts */ + efa_rdm_ep_post_internal_rx_pkts(efa_rdm_ep); + assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, efa_rdm_ep_get_rx_pool_size(efa_rdm_ep)); + + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0); + for (i = 0; i < 4; i++) { + pkt_entry = ofi_bufpool_get_ibuf(efa_rdm_ep->efa_rx_pkt_pool, i); + assert_non_null(pkt_entry); + efa_rdm_pke_release_rx(pkt_entry); + } + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 4); + + efa_rdm_ep_bulk_post_internal_rx_pkts(efa_rdm_ep); + + /** + * efa_rx_pkts_to_post < 
FI_EFA_INTERNAL_RX_REFILL_THRESHOLD + * pkts should NOT be refilled + */ + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 4); + assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, rx_size); + + /* releasing more pkts to reach the threshold or rx_size */ + for (i = 4; i < MIN(rx_size, threshold); i++) { + pkt_entry = ofi_bufpool_get_ibuf(efa_rdm_ep->efa_rx_pkt_pool, i); + assert_non_null(pkt_entry); + efa_rdm_pke_release_rx(pkt_entry); + } + + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, i); + + efa_rdm_ep_bulk_post_internal_rx_pkts(efa_rdm_ep); + + /** + * efa_rx_pkts_to_post == min(FI_EFA_INTERNAL_RX_REFILL_THRESHOLD, rx pool size) + * pkts should be refilled + */ + assert_int_equal(efa_rdm_ep->efa_rx_pkts_to_post, 0); + assert_int_equal(efa_rdm_ep->efa_rx_pkts_posted, rx_size + i); + + /* recover the original value */ + efa_env.internal_rx_refill_threshold = threshold_orig; +} + +void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(struct efa_resource **state) +{ + test_efa_rdm_ep_rx_refill_impl(state, 8, 64); +} + +void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(struct efa_resource **state) +{ + test_efa_rdm_ep_rx_refill_impl(state, 128, 64); +} diff --git a/prov/efa/test/efa_unit_tests.c b/prov/efa/test/efa_unit_tests.c index 2232ea36059..2ada3f5d820 100644 --- a/prov/efa/test/efa_unit_tests.c +++ b/prov/efa/test/efa_unit_tests.c @@ -114,6 +114,8 @@ int main(void) cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_cancel, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_zcpy_recv_eagain, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), + 
cmocka_unit_test_setup_teardown(test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_dgram_cq_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_empty_cq, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), cmocka_unit_test_setup_teardown(test_ibv_cq_ex_read_failed_poll, efa_unit_test_mocks_setup, efa_unit_test_mocks_teardown), diff --git a/prov/efa/test/efa_unit_tests.h b/prov/efa/test/efa_unit_tests.h index d44368bc81f..96958b0255f 100644 --- a/prov/efa/test/efa_unit_tests.h +++ b/prov/efa/test/efa_unit_tests.h @@ -128,6 +128,8 @@ void test_efa_rdm_ep_close_discard_posted_recv(); void test_efa_rdm_ep_zcpy_recv_cancel(); void test_efa_rdm_ep_zcpy_recv_eagain(); void test_efa_rdm_ep_post_handshake_error_handling_pke_exhaustion(); +void test_efa_rdm_ep_rx_refill_threshold_smaller_than_rx_size(); +void test_efa_rdm_ep_rx_refill_threshold_larger_than_rx_size(); void test_dgram_cq_read_empty_cq(); void test_ibv_cq_ex_read_empty_cq(); void test_ibv_cq_ex_read_failed_poll();