diff --git a/build/bli_config.h.in b/build/bli_config.h.in index 41e76d2144..7dc67059f8 100644 --- a/build/bli_config.h.in +++ b/build/bli_config.h.in @@ -80,6 +80,10 @@ #define BLIS_ENABLE_JRIR_RR #endif +#if @enable_jrir_tlb@ +#define BLIS_ENABLE_JRIR_TLB +#endif + #if @enable_pba_pools@ #define BLIS_ENABLE_PBA_POOLS #else diff --git a/configure b/configure index 286a66123c..06201b4fa9 100755 --- a/configure +++ b/configure @@ -340,16 +340,36 @@ print_usage() echo " " echo " -r METHOD, --thread-part-jrir=METHOD" echo " " - echo " Request a method of assigning micropanels to threads in" - echo " the JR and IR loops. Valid values for METHOD are 'slab'" - echo " and 'rr'. Using 'slab' assigns (as much as possible)" - echo " contiguous regions of micropanels to each thread while" - echo " using 'rr' assigns micropanels to threads in a round-" - echo " robin fashion. The chosen method also applies during" - echo " the packing of A and B. The default method is 'slab'." - echo " NOTE: Specifying this option constitutes a request," - echo " which may be ignored in select situations if the" - echo " implementation has a good reason to do so." + echo " Select a strategy for partitioning computation in JR and" + echo " IR loops and assigning that computation to threads. Valid" + echo " values for METHOD are 'rr', 'slab', and 'tlb':" + echo " 'rr': Assign the computation associated with whole" + echo " columns of microtiles to threads in a round-" + echo " robin fashion. When selected, round-robin" + echo " assignment is also employed during packing." + echo " 'slab': Partition the computation into N contiguous" + echo " regions, where each region contains a whole" + echo " number of microtile columns, and assign one" + echo " region to each thread. For some operations, the" + echo " number of microtile columns contained within a" + echo " given region may differ from that of other" + echo " regions, depending on how much work is implied" + echo " by each region. When selected, slab assignment" + echo " is also employed during packing." + echo " 'tlb': Tile-level load balancing is similar to slab," + echo " except that regions will be divided at a more" + echo " granular level (individual microtiles instead" + echo " of whole columns of microtiles) to ensure more" + echo " equitable assignment of work to threads. When" + echo " selected, tlb will only be employed for level-3" + echo " operations except trsm; due to practical and" + echo " algorithmic limitations, slab partitioning will" + echo " be used instead during packing and for trsm." + echo " The default strategy is 'slab'. NOTE: Specifying this" + echo " option constitutes a request, which may be ignored in" + echo " select situations if implementation has a good reason to" + echo " do so. (See description of 'tlb' above for an example of" + echo " this.)" echo " " echo " --disable-trsm-preinversion, --enable-trsm-preinversion" echo " " @@ -3731,16 +3751,20 @@ main() # Check the method of assigning micropanels to threads in the JR and IR # loops. - enable_jrir_slab_01=0 enable_jrir_rr_01=0 - if [ "x${thread_part_jrir}" = "xslab" ]; then - echo "${script_name}: requesting slab threading in jr and ir loops." - enable_jrir_slab_01=1 - elif [ "x${thread_part_jrir}" = "xrr" ]; then - echo "${script_name}: requesting round-robin threading in jr and ir loops." + enable_jrir_slab_01=0 + enable_jrir_tlb_01=0 + if [ "x${thread_part_jrir}" = "xrr" ]; then + echo "${script_name}: requesting round-robin (rr) work partitioning in jr and/or ir loops." 
enable_jrir_rr_01=1 + elif [ "x${thread_part_jrir}" = "xslab" ]; then + echo "${script_name}: requesting slab work partitioning in jr and/or ir loops." + enable_jrir_slab_01=1 + elif [ "x${thread_part_jrir}" = "xtlb" ]; then + echo "${script_name}: requesting tile-level load balancing (tlb) in unified jr+ir loop." + enable_jrir_tlb_01=1 else - echo "${script_name}: *** Unsupported method of thread partitioning in jr and ir loops: ${thread_part_jrir}." + echo "${script_name}: *** Unsupported method of work partitioning in jr/ir loops: ${thread_part_jrir}." exit 1 fi @@ -4177,8 +4201,9 @@ main() | sed -e "s/@enable_pthreads_as_def@/${enable_pthreads_as_def_01}/g" \ | sed -e "s/@enable_hpx@/${enable_hpx_01}/g" \ | sed -e "s/@enable_hpx_as_def@/${enable_hpx_as_def_01}/g" \ - | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ | sed -e "s/@enable_jrir_rr@/${enable_jrir_rr_01}/g" \ + | sed -e "s/@enable_jrir_slab@/${enable_jrir_slab_01}/g" \ + | sed -e "s/@enable_jrir_tlb@/${enable_jrir_tlb_01}/g" \ | sed -e "s/@enable_pba_pools@/${enable_pba_pools_01}/g" \ | sed -e "s/@enable_sba_pools@/${enable_sba_pools_01}/g" \ | sed -e "s/@enable_mem_tracing@/${enable_mem_tracing_01}/g" \ diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 80878fba01..7d73bf903e 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -39,7 +39,6 @@ #include "bli_packm_init.h" #include "bli_packm_int.h" #include "bli_packm_scalar.h" -#include "bli_packm_thrinfo.h" #include "bli_packm_part.h" diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index b8f4f945d9..561988e7f7 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -170,11 +170,11 @@ void bli_packm_blk_var1 const dim_t tid = bli_thrinfo_work_id( thread ); // Determine the thread range and increment using the current thread's - // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + // packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() // will depend on whether slab or round-robin partitioning was requested // at configure-time. dim_t it_start, it_end, it_inc; - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); char* p_begin = p_cast; @@ -195,10 +195,10 @@ void bli_packm_blk_var1 char* c_begin = c_cast + (ic )*incc*dt_c_size; - // Hermitian/symmetric and general packing may use slab or - // round-robin (bli_packm_my_iter()), depending on which was - // selected at configure-time. - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) + // Hermitian/symmetric and general packing may use slab or round- + // robin (bli_is_my_iter()), depending on which was selected at + // configure-time. + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) { packm_ker_cast( bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc, diagc, @@ -286,9 +286,9 @@ void bli_packm_blk_var1 // We nudge the imaginary stride up by one if it is odd. is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 ); - // NOTE: We MUST use round-robin work allocation (bli_packm_my_iter_rr()) + // NOTE: We MUST use round-robin work allocation (bli_is_my_iter_rr()) // when packing micropanels of a triangular matrix. 
- if ( bli_packm_my_iter_rr( it, it_start, it_end, tid, nt ) ) + if ( bli_is_my_iter_rr( it, tid, nt ) ) { packm_ker_cast( strucc, diagc, diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index e47f65aeaf..67b33f407c 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -155,10 +155,10 @@ void PASTEMAC(ch,varname) \ dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. */ \ for ( ic = ic0, it = 0; it < n_iter; \ @@ -175,9 +175,9 @@ void PASTEMAC(ch,varname) \ panel_len_i = panel_len_full; \ panel_len_max_i = panel_len_max; \ \ - /* The definition of bli_packm_my_iter() will depend on whether slab + /* The definition of bli_is_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ f \ ( \ @@ -398,10 +398,10 @@ void PASTEMAC(ch,varname) \ dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. */ \ for ( it = 0; it < n_iter; it += 1 ) \ @@ -412,9 +412,9 @@ void PASTEMAC(ch,varname) \ ctype* p_use = p_begin; \ \ { \ - /* The definition of bli_packm_my_iter() will depend on whether slab + /* The definition of bli_is_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ PASTEMAC2(ch,scal2v,BLIS_TAPI_EX_SUF) \ ( \ diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c index d65482243b..4162c3d33e 100644 --- a/frame/3/bli_l3_sup_var12.c +++ b/frame/3/bli_l3_sup_var12.c @@ -357,11 +357,11 @@ void PASTEMAC(ch,varname) \ object. */ \ /* ctype* a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_slrr( i, ir_iter, 0, 1 ) ) \ { \ a2 = a_00; \ b2 = bli_gemm_get_next_b_upanel( b_jr, jrstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, jr_iter, 0, 1 ) ) \ + if ( bli_is_last_iter_slrr( j, jr_iter, 0, 1 ) ) \ b2 = b_00; \ } \ \ diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index b1290df508..2ea7a3fc23 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -39,22 +39,22 @@ // gemm -// NOTE: The definition of bli_gemm_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. 
#define bli_gemm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) // gemmt -// NOTE: The definition of bli_gemmt_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_gemmt_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_gemmt_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) +// NOTE: Here, we assume NO parallelism in the IR loop. +#define bli_gemmt_l_wrap_a_upanel( a0, step, doff_j, mr, nr ) \ + ( a0 + ( (-doff_j + 1*nr) / mr ) * step ) +#define bli_gemmt_u_wrap_a_upanel( a0, step, doff_j, mr, nr ) \ + ( a0 ) + // trmm -// NOTE: The definition of bli_trmm_get_next_?_upanel() does not need to -// change depending on BLIS_ENABLE_JRIR_SLAB / BLIS_ENABLE_JRIR_RR. #define bli_trmm_get_next_a_upanel( a1, step, inc ) ( a1 + step * inc ) #define bli_trmm_get_next_b_upanel( b1, step, inc ) ( b1 + step * inc ) diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index bd8d97d13d..b9c231cf72 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -61,10 +61,25 @@ cntl_t* bli_gemmbp_cntl_create void_fp macro_kernel_fp; // Choose the default macrokernel based on the operation family... - if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; - else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2; - else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; - else /* should never execute */ macro_kernel_fp = NULL; + if ( family == BLIS_GEMM ) macro_kernel_fp = + #ifdef BLIS_ENABLE_JRIR_TLB + bli_gemm_ker_var2b; + #else // ifdef ( _SLAB || _RR ) + bli_gemm_ker_var2; + #endif + else if ( family == BLIS_GEMMT ) macro_kernel_fp = + #ifdef BLIS_ENABLE_JRIR_TLB + bli_gemmt_x_ker_var2b; + #else // ifdef ( _SLAB || _RR ) + bli_gemmt_x_ker_var2; + #endif + else if ( family == BLIS_TRMM ) macro_kernel_fp = + #ifdef BLIS_ENABLE_JRIR_TLB + bli_trmm_xx_ker_var2b; + #else // ifdef ( _SLAB || _RR ) + bli_trmm_xx_ker_var2; + #endif + else /* should never execute */ macro_kernel_fp = NULL; // ...unless a non-NULL kernel function pointer is passed in, in which // case we use that instead. 
diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index d596950819..3e862e6c59 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -47,7 +47,7 @@ typedef void (*xpbys_mxn_vft) #undef GENTFUNC2 #define GENTFUNC2(ctypex,ctypey,chx,chy,op) \ \ -void PASTEMAC2(chx,chy,op) \ +BLIS_INLINE void PASTEMAC2(chx,chy,op) \ ( \ dim_t m, \ dim_t n, \ @@ -77,31 +77,31 @@ static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxn_fn); void bli_gemm_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { num_t dt_exec = bli_obj_exec_dt( c ); num_t dt_c = bli_obj_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); dim_t m = bli_obj_length( c ); dim_t n = bli_obj_width( c ); dim_t k = bli_obj_width( a ); const char* a_cast = bli_obj_buffer_at_off( a ); - inc_t is_a = bli_obj_imag_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); dim_t pd_a = bli_obj_panel_dim( a ); inc_t ps_a = bli_obj_panel_stride( a ); const char* b_cast = bli_obj_buffer_at_off( b ); - inc_t is_b = bli_obj_imag_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); dim_t pd_b = bli_obj_panel_dim( b ); inc_t ps_b = bli_obj_panel_stride( b ); @@ -116,8 +116,7 @@ void bli_gemm_ker_var2 // NOTE: We know that the internal scalars of A and B are already of the // target datatypes because the necessary typecasting would have already // taken place during bli_packm_init(). - obj_t scalar_a; - obj_t scalar_b; + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); @@ -217,22 +216,19 @@ void bli_gemm_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_c_size; - inc_t cstep_c = cs_c * NR * dt_c_size; + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; auxinfo_t aux; @@ -255,20 +251,19 @@ void bli_gemm_ker_var2 thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; // Determine the thread range and increment for the 2nd and 1st loops. 
- // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -276,7 +271,9 @@ void bli_gemm_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -287,15 +284,17 @@ void bli_gemm_ker_var2 const char* a1 = a_cast + i * rstep_a; char* c11 = c1 + i * rstep_c; - const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // Compute the addresses of the next panels of A and B. const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) + if ( bli_is_last_iter_slrr( i, ir_end, ir_tid, ir_nt ) ) { a2 = a_cast; b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) ) b2 = b_cast; } @@ -342,22 +341,20 @@ void bli_gemm_ker_var2 ( cntx_t* )cntx ); - // Accumulate to C with type-casting. + // Accumulate to C with typecasting. xpbys_mxn[ dt_exec ][ dt_c ] ( - m_cur, n_cur, - &ct, rs_ct, cs_ct, - ( void* )beta_cast, - c11, rs_c, cs_c + m_cur, n_cur, + &ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c ); } } } - -/* -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); -PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); -*/ } +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: b1", k, NR, b1, NR, 1, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: a1", MR, k, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); + diff --git a/frame/3/gemm/bli_gemm_ker_var2b.c b/frame/3/gemm/bli_gemm_ker_var2b.c new file mode 100644 index 0000000000..50375708af --- /dev/null +++ b/frame/3/gemm/bli_gemm_ker_var2b.c @@ -0,0 +1,379 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +typedef void (*xpbys_mxn_vft) + ( + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); + +#undef GENTFUNC2 +#define GENTFUNC2(ctypex,ctypey,chx,chy,op) \ +\ +BLIS_INLINE void PASTEMAC2(chx,chy,op) \ + ( \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctypex* restrict x_cast = x; \ + ctypey* restrict b_cast = b; \ + ctypey* restrict y_cast = y; \ +\ + PASTEMAC3(chx,chy,chy,xpbys_mxn) \ + ( \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC2_BASIC0(xpbys_mxnb_fn); +INSERT_GENTFUNC2_MIXDP0(xpbys_mxnb_fn); + +static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxnb_fn); + + +void bli_gemm_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + num_t dt_exec = bli_obj_exec_dt( c ); + num_t dt_c = bli_obj_dt( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const char* a_cast = bli_obj_buffer_at_off( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); + + const char* b_cast = bli_obj_buffer_at_off( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); + + char* c_cast = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Detach and multiply the scalars attached to A and B. + // NOTE: We know that the internal scalars of A and B are already of the + // target datatypes because the necessary typecasting would have already + // taken place during bli_packm_init(). + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ // NOTE: We know that scalar_b is of type dt_exec due to the above code + // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, + // and we know that the internal scalar in C is already of the type dt_c + // due to the casting in the implementation of bli_obj_scalar_attach(). + const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); + const char* beta_cast = bli_obj_internal_scalar_buffer( c ); + + // If 1m is being employed on a column- or row-stored matrix with a + // real-valued beta, we can use the real domain macro-kernel, which + // eliminates a little overhead associated with the 1m virtual + // micro-kernel. + // Only employ this optimization if the storage datatype of C is + // equal to the execution/computation datatype. +#if 1 + if ( bli_cntx_method( cntx ) == BLIS_1M ) + { + bli_gemm_ind_recast_1m_params + ( + &dt_exec, + &dt_c, + schema_a, + c, + &m, &n, &k, + &pd_a, &ps_a, + &pd_b, &ps_b, + &rs_c, &cs_c, + cntx + ); + } +#endif + +#ifdef BLIS_ENABLE_GEMM_MD + // Tweak parameters in select mixed domain cases (rcc, crc, ccr). + if ( bli_cntx_method( cntx ) == BLIS_NAT ) + { + bli_gemm_md_ker_var2_recast + ( + &dt_exec, + bli_obj_dt( a ), + bli_obj_dt( b ), + &dt_c, + &m, &n, &k, + &pd_a, &ps_a, + &pd_b, &ps_b, + c, + &rs_c, &cs_c + ); + } +#endif + + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + + // Query the params field from the obj_t. If it is non-NULL, grab the ukr + // field of the params struct. If that function pointer is non-NULL, use it + // as our microkernel instead of the default microkernel queried from the + // cntx above. + const gemm_ker_params_t* params = bli_obj_ker_params( c ); + gemm_ukr_vft user_ukr = params ? params->ukr : NULL; + if ( user_ukr ) gemm_ukr = user_ukr; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? MR : 1 ); + const char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + + // + // Assumptions/assertions: + // rs_a == 1 + // cs_a == PACKMR + // pd_a == MR + // ps_a == stride to next micro-panel of A + // rs_b == PACKNR + // cs_b == 1 + // pd_b == NR + // ps_b == stride to next micro-panel of B + // rs_c == (no assumptions) + // cs_c == (no assumptions) + // + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Determine some increments used to step through A, B, and C. 
+ const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // Save the virtual microkernel address and the params. + bli_auxinfo_set_ukr( gemm_ukr, &aux ); + bli_auxinfo_set_params( params, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Notice that this variant doesn't utilize + // parallelism in the 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + // Determine the starting microtile offsets and number of microtiles to + // compute for each thread. Note that assignment of microtiles is done + // according to the tlb policy. + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_d( jr_nt, jr_tid, m_iter, n_iter, MR, NR, &jr_st, &ir_st ); + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_gemm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Edge case handling now occurs within the microkernel itself, but + // we must still explicitly accumulate to a temporary microtile in + // situations where a virtual microkernel is being used, such as + // during the 1m method or some cases of mixed datatypes. + if ( dt_exec == dt_c ) + { + // Invoke the gemm micro-kernel. 
+ gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + else + { + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + &ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + // Accumulate to C with typecasting. + xpbys_mxn[ dt_exec ][ dt_c ] + ( + m_cur, n_cur, + &ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c + ); + } + + ut += 1; + if ( ut == n_ut_for_me ) return; + } + + i = 0; + } +} + +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: b1", k, NR, b1, NR, 1, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: a1", MR, k, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "gemm_ker_var2b: c after", m_cur, n_cur, c11, rs_c, cs_c, "%4.1f", "" ); diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 24f7ecfb9e..f69327db0c 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -65,6 +65,7 @@ GENPROT( gemm_blk_var1 ) GENPROT( gemm_blk_var2 ) GENPROT( gemm_blk_var3 ) -GENPROT( gemm_ker_var1 ) GENPROT( gemm_ker_var2 ) +GENPROT( gemm_ker_var2b ) + diff --git a/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c new file mode 100644 index 0000000000..fbfafebb0e --- /dev/null +++ b/frame/3/gemmt/attic/bli_gemmt_l_ker_var2b.c @@ -0,0 +1,429 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +typedef void (*xpbys_mxn_l_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn); + +static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn); + +void bli_gemmt_l_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. 
Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still negative (though its absolute value is guaranteed to be less + than MR). */ \ + if ( diagoffc < 0 ) \ + { \ + const dim_t ip = -diagoffc / MR; \ + const dim_t i = ip * MR; \ +\ + m = m - i; \ + diagoffc = diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \ + const dim_t n_left = n % NR; \ +\ + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \ + const dim_t m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + const inc_t rstep_a = ps_a; \ +\ + const inc_t cstep_b = ps_b; \ +\ + const inc_t rstep_c = rs_c * MR; \ + const inc_t cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + const dim_t jr_inc = 1; \ + const dim_t ir_inc = 1; \ +\ + /* Determine the starting microtile offsets and number of microtiles to + compute for each thread. Note that assignment of microtiles is done + according to the tlb policy. 
*/ \ + dim_t jr_st, ir_st; \ + const dim_t n_ut_for_me \ + = \ + bli_thread_range_tlb( thread, diagoffc, BLIS_LOWER, m, n, MR, NR, \ + &jr_st, &ir_st ); \ +\ + /* It's possible that there are so few microtiles relative to the number + of threads that one or more threads gets no work. If that happens, those + threads can return early. */ \ + if ( n_ut_for_me == 0 ) return; \ +\ + /* Start the jr/ir loops with the current thread's microtile offsets computed + by bli_thread_range_tlb(). */ \ + dim_t i = ir_st; \ + dim_t j = jr_st; \ +\ + /* Initialize a counter to track the number of microtiles computed by the + current thread. */ \ + dim_t ut = 0; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( ; true; ++j ) \ + { \ + ctype* restrict b1 = b_cast + j * cstep_b; \ + ctype* restrict c1 = c_cast + j * cstep_c; \ +\ + /* Compute the diagonal offset for the column of microtiles at (0,j). */ \ + const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \ + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) \ + ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + ctype* restrict b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( ; i < m_iter; ++i ) \ + { \ + /* Compute the diagonal offset for the microtile at (i,j). */ \ + const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \ + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) \ + ? MR : m_left ); \ +\ + /* If the diagonal intersects the current MR x NR microtile, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the microtile is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we simply advance + to last microtile before the diagonal. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ +\ + ut += 1; \ + if ( ut == n_ut_for_me ) return; \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_tlb_l( i, m_iter ) ) \ + { \ + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + /* We don't bother computing b2 for the last iteration of the + jr loop since the current thread won't know its j_st until + the next time it calls bli_thread_range_tlb(). */ \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ +\ + ut += 1; \ + if ( ut == n_ut_for_me ) return; \ + } \ + else /* if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) */ \ + { \ + /* Skip ahead to the last microtile strictly above the diagonal. */ \ + i = -diagoffc_j / MR - 1; \ + } \ + } \ +\ + /* Upon reaching the end of the column of microtiles, get ready to begin at + the beginning of the next column (i.e., the next jr loop iteration). */ \ + i = 0; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2b ) + diff --git a/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c new file mode 100644 index 0000000000..311180d192 --- /dev/null +++ b/frame/3/gemmt/attic/bli_gemmt_u_ker_var2b.c @@ -0,0 +1,418 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2b); + + +void bli_gemmt_u_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ + if ( diagoffc > 0 ) \ + { \ + const dim_t jp = diagoffc / NR; \ + const dim_t j = jp * NR; \ +\ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \ + const dim_t n_left = n % NR; \ +\ + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \ + const dim_t m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + const inc_t rstep_a = ps_a; \ +\ + const inc_t cstep_b = ps_b; \ +\ + const inc_t rstep_c = rs_c * MR; \ + const inc_t cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the virtual microkernel address and the params. */ \ + /*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \ + /*bli_auxinfo_set_params( params, &aux );*/ \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + const dim_t jr_inc = 1; \ + const dim_t ir_inc = 1; \ +\ + /* Determine the starting microtile offsets and number of microtiles to + compute for each thread. Note that assignment of microtiles is done + according to the tlb policy. 
*/ \ + dim_t jr_st, ir_st; \ + const dim_t n_ut_for_me \ + = \ + bli_thread_range_tlb( thread, diagoffc, BLIS_UPPER, m, n, MR, NR, \ + &jr_st, &ir_st ); \ +\ + /* It's possible that there are so few microtiles relative to the number + of threads that one or more threads gets no work. If that happens, those + threads can return early. */ \ + if ( n_ut_for_me == 0 ) return; \ +\ + /* Start the jr/ir loops with the current thread's microtile offsets computed + by bli_thread_range_tlb(). */ \ + dim_t i = ir_st; \ + dim_t j = jr_st; \ +\ + /* Initialize a counter to track the number of microtiles computed by the + current thread. */ \ + dim_t ut = 0; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( ; true; ++j ) \ + { \ + ctype* restrict b1 = b_cast + j * cstep_b; \ + ctype* restrict c1 = c_cast + j * cstep_c; \ +\ + /* Compute the diagonal offset for the column of microtiles at (0,j). */ \ + const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \ + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) \ + ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + ctype* restrict b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( ; i < m_iter; ++i ) \ + { \ + /* Compute the diagonal offset for the microtile at (i,j). */ \ + const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \ + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) \ + ? MR : m_left ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we simply advance + to last microtile before the bottom of the matrix. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter_tlb_u( diagoffc_ij, MR, NR ) ) \ + { \ + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + /* We don't bother computing b2 for the last iteration of the + jr loop since the current thread won't know its j_st until + the next time it calls bli_thread_range_tlb(). */ \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ +\ + ut += 1; \ + if ( ut == n_ut_for_me ) return; \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. 
*/ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ +\ + ut += 1; \ + if ( ut == n_ut_for_me ) return; \ + } \ + else /* if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) */ \ + { \ + /* Skip past the microtiles strictly below the diagonal. */ \ + i = m_iter - 1; \ + } \ + } \ +\ + i = 0; \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2b ) + diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index 4a3a48304f..fd726da6f7 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -48,7 +48,7 @@ typedef void (*xpbys_mxn_l_vft) #undef GENTFUNC #define GENTFUNC(ctype,ch,op) \ \ -void PASTEMAC(ch,op) \ +BLIS_INLINE void PASTEMAC(ch,op) \ ( \ doff_t diagoff, \ dim_t m, \ @@ -76,18 +76,19 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn); static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn); + void bli_gemmt_l_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { - const num_t dt = bli_obj_exec_dt( c ); - const dim_t dt_size = bli_dt_size( dt ); + const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_c = bli_obj_dt( c ); doff_t diagoffc = bli_obj_diag_offset( c ); @@ -113,7 +114,7 @@ void bli_gemmt_l_ker_var2 const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. - obj_t scalar_a, scalar_b; + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); @@ -123,14 +124,17 @@ void bli_gemmt_l_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + // Alias some constants to simpler names. const dim_t MR = pd_a; const dim_t NR = pd_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. - gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); - xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt ]; + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ]; // Temporary C buffer for edge cases. Note that the strides of this // temporary buffer are set so that they match the storage of the @@ -138,11 +142,11 @@ void bli_gemmt_l_ker_var2 // column-stored as well. char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? MR : 1 ); - const void* zero = bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + const void* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); const char* a_cast = buf_a; const char* b_cast = buf_b; char* c_cast = buf_c; @@ -175,12 +179,13 @@ void bli_gemmt_l_ker_var2 // this case as if the diagonal offset were zero. 
if ( diagoffc < 0 ) { - dim_t ip = -diagoffc / MR; - dim_t i = ip * MR; - m = m - i; - diagoffc = -diagoffc % MR; - c_cast = c_cast + (i )*rs_c*dt_size; - a_cast = a_cast + (ip )*ps_a*dt_size; + const dim_t ip = -diagoffc / MR; + const dim_t i = ip * MR; + + m = m - i; + diagoffc = diagoffc % MR; + c_cast = c_cast + (i )*rs_c*dt_c_size; + a_cast = a_cast + (ip )*ps_a*dt_size; } // If there is a zero region to the right of where the diagonal @@ -193,25 +198,23 @@ void bli_gemmt_l_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); @@ -219,9 +222,6 @@ void bli_gemmt_l_ker_var2 bli_auxinfo_set_is_a( is_a, &aux ); bli_auxinfo_set_is_b( is_b, &aux ); - // Save the desired output datatype (indicating no typecasting). - //bli_auxinfo_set_dt_on_output( dt, &aux );*/ - // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. @@ -229,48 +229,21 @@ void bli_gemmt_l_ker_var2 thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); - - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - // Note that we partition the 2nd loop into two regions: the rectangular - // part of C, and the triangular portion. - dim_t n_iter_rct; - dim_t n_iter_tri; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) - { - // If the entire panel of C does not intersect the diagonal, there is - // no triangular region, and therefore we can skip the second set of - // loops. - n_iter_rct = n_iter; - n_iter_tri = 0; - } - else - { - // If the panel of C does intersect the diagonal, compute the number of - // iterations in the rectangular region by dividing NR into the diagonal - // offset. Any remainder from this integer division is discarded, which - // is what we want. That is, we want the rectangular region to contain - // as many columns of whole microtiles as possible without including any - // microtiles that intersect the diagonal. 
The number of iterations in - // the triangular (or trapezoidal) region is computed as the remaining - // number of iterations in the n dimension. - n_iter_rct = diagoffc / NR; - n_iter_tri = n_iter - n_iter_rct; - } - - // Determine the thread range and increment for the 2nd and 1st loops for - // the initial rectangular region of C (if it exists). - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // Determine the thread range and increment for the 2nd and 1st loops. + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_quad( thread, diagoffc, BLIS_LOWER, m, n, NR, + FALSE, &jr_start, &jr_end, &jr_inc ); + //bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -278,7 +251,12 @@ void bli_gemmt_l_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + // Compute the diagonal offset for the column of microtiles at (0,j). + const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -286,115 +264,34 @@ void bli_gemmt_l_ker_var2 // Interior loop over the m dimension (MR rows at a time). for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; - - // No need to compute the diagonal offset for the rectangular - // region. - //diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ + // Compute the diagonal offset for the microtile at (i,j). + const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); - // Compute the addresses of the next panels of A and B. - const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) - { - a2 = a_cast; - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; - } - - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - bli_auxinfo_set_next_b( b2, &aux ); - - // If the diagonal intersects the current MR x NR submatrix, we + // If the diagonal intersects the current MR x NR microtile, we // compute it the temporary buffer and then add in the elements // on or below the diagonal. - // Otherwise, if the submatrix is strictly below the diagonal, + // Otherwise, if the microtile is strictly below the diagonal, // we compute and store as we normally would. // And if we're strictly above the diagonal, we do nothing and - // continue. + // continue on through the IR loop to consider the next MR x NR + // microtile. 
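For reference, the three-way test described above can be reproduced with a few lines of plain C. The sketch below is illustrative only: the helper names are invented here rather than taken from BLIS, and the actual bli_intersects_diag_n() / bli_is_strictly_*_diag_n() macros may be written differently, but it follows the same convention used by diagoffc_ij above, namely that element (r,c) of a tile lies on the diagonal when c - r equals the tile's diagonal offset:

// Minimal, self-contained sketch (not BLIS source) of the microtile-vs-diagonal
// classification. Convention: element (r,c) lies on the diagonal when c - r == d.

#include <stdbool.h>
#include <stdio.h>

// The whole m_cur x n_cur tile is strictly below the diagonal when even its
// top-right element (0, n_cur-1) lies below it.
static bool tile_strictly_below( long long d, long long m_cur, long long n_cur )
{
    return n_cur <= d;
}

// The whole tile is strictly above the diagonal when even its bottom-left
// element (m_cur-1, 0) lies above it.
static bool tile_strictly_above( long long d, long long m_cur, long long n_cur )
{
    return m_cur <= -d;
}

static bool tile_intersects_diag( long long d, long long m_cur, long long n_cur )
{
    return !tile_strictly_below( d, m_cur, n_cur ) &&
           !tile_strictly_above( d, m_cur, n_cur );
}

int main( void )
{
    // Example: full MR x NR microtiles with MR = 6, NR = 8, and a panel
    // diagonal offset of 0 for C.
    const long long MR = 6, NR = 8, diagoffc = 0;

    for ( long long j = 0; j < 3; ++j )
    for ( long long i = 0; i < 3; ++i )
    {
        // Local diagonal offset of the microtile at (i,j), computed as in the
        // macrokernel: diagoffc_ij = diagoffc - j*NR + i*MR.
        const long long d_ij = diagoffc - j*NR + i*MR;

        printf( "microtile (%lld,%lld): %s\n", i, j,
                tile_intersects_diag( d_ij, MR, NR ) ? "intersects the diagonal" :
                tile_strictly_below ( d_ij, MR, NR ) ? "strictly below"          :
                                                       "strictly above" );
    }
    return 0;
}

With diagoffc = 0, MR = 6, and NR = 8, microtile (2,0) prints as strictly below and (0,1) as strictly above, matching the regions the macrokernel computes directly or skips.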
+ if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) { - // Invoke the gemm micro-kernel. - gemm_ukr - ( - m_cur, - n_cur, - k, - ( void* )alpha_cast, - ( void* )a1, - ( void* )b1, - ( void* )beta_cast, - c11, rs_c, cs_c, - &aux, - ( cntx_t* )cntx - ); - } - } - } - - // If there is no triangular region, then we're done. - if ( n_iter_tri == 0 ) return; - - // Use round-robin assignment of micropanels to threads in the 2nd loop - // and the default (slab or rr) partitioning in the 1st loop for the - // remaining triangular region of C. - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - - // Advance the start and end iteration offsets for the triangular region - // by the number of iterations used for the rectangular region. - jr_start += n_iter_rct; - jr_end += n_iter_rct; - - // Loop over the n dimension (NR columns at a time). - for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) - { - const char* b1 = b_cast + j * cstep_b; - char* c1 = c_cast + j * cstep_c; - - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); - - // Initialize our next panel of B to be the current panel of B. - const char* b2 = b1; - - // Interior loop over the m dimension (MR rows at a time). - for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) - { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; - // Compute the diagonal offset for the submatrix at (i,j). - doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + // Compute the addresses of the next panel of A. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); - // Compute the addresses of the next panels of A and B. - const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) - { - a2 = a_cast; - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; - } - - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - bli_auxinfo_set_next_b( b2, &aux ); - - // If the diagonal intersects the current MR x NR submatrix, we - // compute it the temporary buffer and then add in the elements - // on or below the diagonal. - // Otherwise, if the submatrix is strictly below the diagonal, - // we compute and store as we normally would. - // And if we're strictly above the diagonal, we do nothing and - // continue. - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) - { // Invoke the gemm micro-kernel. gemm_ukr ( @@ -411,14 +308,35 @@ void bli_gemmt_l_ker_var2 ); // Scale C and add the result to only the stored part. - xpbys_mxn_l_ukr( diagoffc_ij, - m_cur, n_cur, - ct, rs_ct, cs_ct, - ( void* )beta_cast, - c11, rs_c, cs_c ); + xpbys_mxn_l_ukr + ( + diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c + ); } else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter_l( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR ); + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + // Invoke the gemm micro-kernel. gemm_ukr ( diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2b.c b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c new file mode 100644 index 0000000000..7c50a4a540 --- /dev/null +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2b.c @@ -0,0 +1,387 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
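The body of the new file that follows defines, among other things, a per-datatype xpbys_mxn_l kernel, which the macrokernel uses to fold the full MR x NR temporary tile back into only the stored part of C, i.e. roughly y := x + beta*y restricted to elements on or below the diagonal. A minimal double-precision sketch of that update, with invented names and simplified strides and no claim to match the BLIS implementation, looks like this:

// Illustrative only: apply y := x + beta*y to the elements of an m x n tile
// that lie on or below the diagonal indicated by 'diagoff' (element (r,c) is
// on the diagonal when c - r == diagoff).

#include <stdio.h>

static void xpbys_mxn_l_d_sketch
     (
       long long diagoff,
       long long m, long long n,
       const double* x, long long rs_x, long long cs_x,
       double beta,
       double*       y, long long rs_y, long long cs_y
     )
{
    for ( long long c = 0; c < n; ++c )
    for ( long long r = 0; r < m; ++r )
    {
        // Only elements on or below the diagonal are stored in a
        // lower-stored C, so only those are updated.
        if ( c - r <= diagoff )
            y[ r*rs_y + c*cs_y ] = x[ r*rs_x + c*cs_x ]
                                 + beta * y[ r*rs_y + c*cs_y ];
    }
}

int main( void )
{
    // 3x3 column-major tile, diagonal offset 0: only the lower triangle of y
    // is updated; the strictly upper part is left untouched.
    double x[9] = { 1,1,1, 1,1,1, 1,1,1 };
    double y[9] = { 2,2,2, 2,2,2, 2,2,2 };

    xpbys_mxn_l_d_sketch( 0, 3, 3, x, 1, 3, 2.0, y, 1, 3 );

    for ( int r = 0; r < 3; ++r )
    {
        for ( int c = 0; c < 3; ++c ) printf( "%5.2f ", y[ r + 3*c ] );
        printf( "\n" );
    }
    return 0;
}

Running it prints 5.00 (= 1 + 2*2) on the lower triangle and leaves 2.00 everywhere else, which is exactly the "scale C and add the result to only the stored part" step the macrokernel performs after each microkernel call.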
+ +*/ + +#include "blis.h" + +typedef void (*xpbys_mxn_l_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +BLIS_INLINE void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn); + +static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn); + + +void bli_gemmt_l_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_c = bli_obj_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt_exec ]; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); + + const void* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of C is entirely above the diagonal, + // it is not stored. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; + + // If there is a zero region above where the diagonal of C intersects + // the left edge of the panel, adjust the pointer to C and A and treat + // this case as if the diagonal offset were zero. + // NOTE: It's possible that after this pruning that the diagonal offset + // is still negative (though its absolute value is guaranteed to be less + // than MR). + if ( diagoffc < 0 ) + { + const dim_t ip = -diagoffc / MR; + const dim_t i = ip * MR; + + m = m - i; + diagoffc = diagoffc % MR; + c_cast = c_cast + (i )*rs_c*dt_c_size; + a_cast = a_cast + (ip )*ps_a*dt_size; + } + + // If there is a zero region to the right of where the diagonal + // of C intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffc + m < n ) + { + n = diagoffc + m; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + // Determine the starting microtile offsets and number of microtiles to + // compute for each thread. Note that assignment of microtiles is done + // according to the tlb policy. + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_l( jr_nt, jr_tid, diagoffc, m_iter, n_iter, MR, NR, + &jr_st, &ir_st ); + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. 
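To make the tlb assignment concrete: bli_thread_range_tlb_l() hands each thread a near-equal share of the microtiles that will actually be computed, rather than a share of whole microtile columns. The standalone sketch below mirrors the shape of that call but is not the BLIS implementation; it simplifies edge cases (full-size tiles only) and exists only to show how a total microtile count is split across threads and how a thread's starting linear index is converted back into (jr_st, ir_st):

// Conceptual sketch of tile-level load balancing for a lower-stored panel of C.
// NOT bli_thread_range_tlb_l(); names and edge-case handling are simplified.

#include <stdio.h>

typedef long long dim_ll;

// First microtile row in column j that is not strictly above the diagonal.
static dim_ll first_stored_row( dim_ll diagoffc, dim_ll MR, dim_ll NR,
                                dim_ll m_iter, dim_ll j )
{
    const dim_ll r = j*NR - diagoffc;        // row where the diagonal crosses column j*NR
    dim_ll i0 = ( r <= 0 ? 0 : r / MR );     // first microtile row touching/below the diagonal
    if ( i0 > m_iter ) i0 = m_iter;          // defensive: column holds no stored tiles
    return i0;
}

// Compute one thread's share of microtiles and its starting (jr_st, ir_st).
static dim_ll thread_range_tlb_l_sketch
     (
       dim_ll nt, dim_ll tid,
       dim_ll diagoffc, dim_ll m_iter, dim_ll n_iter, dim_ll MR, dim_ll NR,
       dim_ll* jr_st, dim_ll* ir_st
     )
{
    // Total number of microtiles that will actually be computed.
    dim_ll total = 0;
    for ( dim_ll j = 0; j < n_iter; ++j )
        total += m_iter - first_stored_row( diagoffc, MR, NR, m_iter, j );

    // Give each thread an equal share; spread any remainder over the
    // lowest-numbered threads.
    const dim_ll base = total / nt;
    const dim_ll rem  = total % nt;
    const dim_ll n_ut_for_me = base + ( tid < rem ? 1 : 0 );
    dim_ll start = tid * base + ( tid < rem ? tid : rem );

    // Walk the columns in traversal order to convert the starting linear
    // index into microtile coordinates (jr_st, ir_st).
    *jr_st = 0; *ir_st = 0;
    for ( dim_ll j = 0; j < n_iter; ++j )
    {
        const dim_ll i0     = first_stored_row( diagoffc, MR, NR, m_iter, j );
        const dim_ll in_col = m_iter - i0;
        if ( start < in_col ) { *jr_st = j; *ir_st = i0 + start; break; }
        start -= in_col;
    }
    return n_ut_for_me;
}

int main( void )
{
    // Example: an 8x8 grid of microtiles (MR = NR = 8) of a lower-triangular C
    // with diagoffc = 0, partitioned among 4 threads.
    for ( dim_ll tid = 0; tid < 4; ++tid )
    {
        dim_ll jr_st, ir_st;
        const dim_ll n_ut = thread_range_tlb_l_sketch( 4, tid, 0, 8, 8, 8, 8,
                                                       &jr_st, &ir_st );
        printf( "tid %lld: %lld microtiles starting at (ir,jr) = (%lld,%lld)\n",
                tid, n_ut, ir_st, jr_st );
    }
    return 0;
}

For this example (36 stored microtiles), each of the four threads receives 9 microtiles, starting at (ir,jr) = (0,0), (2,1), (5,2), and (5,4) respectively, instead of being handed whole columns whose workloads shrink as the diagonal is approached.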
+ if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + // Compute the diagonal offset for the column of microtiles at (0,j). + const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Interior loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + // Compute the diagonal offset for the microtile at (i,j). + const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR; + + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // If the diagonal intersects the current MR x NR microtile, we + // compute it the temporary buffer and then add in the elements + // on or below the diagonal. + // Otherwise, if the microtile is strictly below the diagonal, + // we compute and store as we normally would. + // And if we're strictly above the diagonal, we simply advance + // to the last microtile before the diagonal. + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panel of A. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 ); + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + // Scale C and add the result to only the stored part. + xpbys_mxn_l_ukr + ( + diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; + if ( ut == n_ut_for_me ) return; + } + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_tlb_l( i, m_iter ) ) + { + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR ); + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; + if ( ut == n_ut_for_me ) return; + } + else // if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + // Skip ahead to the last microtile strictly above the diagonal. 
+ i = -diagoffc_j / MR - 1; + } + } + + // Upon reaching the end of the column of microtiles, get ready to begin + // at the beginning of the next column (i.e., the next jr loop iteration). + i = 0; + } +} + diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index 5b4e1ccd96..78d5b869d2 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -48,7 +48,7 @@ typedef void (*xpbys_mxn_u_vft) #undef GENTFUNC #define GENTFUNC(ctype,ch,op) \ \ -void PASTEMAC(ch,op) \ +BLIS_INLINE void PASTEMAC(ch,op) \ ( \ doff_t diagoff, \ dim_t m, \ @@ -76,18 +76,19 @@ INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn); static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn); + void bli_gemmt_u_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { - const num_t dt = bli_obj_exec_dt( c ); - const dim_t dt_size = bli_dt_size( dt ); + const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_c = bli_obj_dt( c ); doff_t diagoffc = bli_obj_diag_offset( c ); @@ -113,7 +114,7 @@ void bli_gemmt_u_ker_var2 const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. - obj_t scalar_a, scalar_b; + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); @@ -123,14 +124,17 @@ void bli_gemmt_u_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + // Alias some constants to simpler names. const dim_t MR = pd_a; const dim_t NR = pd_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. - gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); - xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt ]; + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ]; // Temporary C buffer for edge cases. Note that the strides of this // temporary buffer are set so that they match the storage of the @@ -138,11 +142,11 @@ void bli_gemmt_u_ker_var2 // column-stored as well. char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? MR : 1 ); - const void* zero = bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + const void* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); const char* a_cast = buf_a; const char* b_cast = buf_b; char* c_cast = buf_c; @@ -177,12 +181,13 @@ void bli_gemmt_u_ker_var2 // is still positive (though it is guaranteed to be less than NR). 
if ( diagoffc > 0 ) { - dim_t jp = diagoffc / NR; - dim_t j = jp * NR; - n = n - j; - diagoffc = diagoffc % NR; - c_cast = c_cast + (j )*cs_c*dt_size; - b_cast = b_cast + (jp )*ps_b*dt_size; + const dim_t jp = diagoffc / NR; + const dim_t j = jp * NR; + + n = n - j; + diagoffc = diagoffc % NR; + c_cast = c_cast + (j )*cs_c*dt_c_size; + b_cast = b_cast + (jp )*ps_b*dt_size; } // If there is a zero region below where the diagonal of C intersects @@ -195,25 +200,23 @@ void bli_gemmt_u_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); @@ -221,9 +224,6 @@ void bli_gemmt_u_ker_var2 bli_auxinfo_set_is_a( is_a, &aux ); bli_auxinfo_set_is_b( is_b, &aux ); - // Save the desired output datatype (indicating no typecasting). - //bli_auxinfo_set_dt_on_output( dt, &aux );*/ - // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. @@ -231,47 +231,21 @@ void bli_gemmt_u_ker_var2 thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); - - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - // Note that we partition the 2nd loop into two regions: the triangular - // part of C, and the rectangular portion. - dim_t n_iter_tri; - dim_t n_iter_rct; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) - { - // If the entire panel of C does not intersect the diagonal, there is - // no triangular region, and therefore we can skip the first set of - // loops. - n_iter_tri = 0; - n_iter_rct = n_iter; - } - else - { - // If the panel of C does intersect the diagonal, compute the number of - // iterations in the triangular (or trapezoidal) region by dividing NR - // into the number of rows in C. A non-zero remainder means we need to - // add one additional iteration. That is, we want the triangular region - // to contain as few columns of whole microtiles as possible while still - // including all microtiles that intersect the diagonal. 
The number of - // iterations in the rectangular region is computed as the remaining - // number of iterations in the n dimension. - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); - n_iter_rct = n_iter - n_iter_tri; - } - - // Use round-robin assignment of micropanels to threads in the 2nd loop - // and the default (slab or rr) partitioning in the 1st loop for the - // initial triangular region of C (if it exists). - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + // Determine the thread range and increment for the 2nd and 1st loops. + // NOTE: The definition of bli_thread_range_slrr() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + bli_thread_range_quad( thread, diagoffc, BLIS_UPPER, m, n, NR, + FALSE, &jr_start, &jr_end, &jr_inc ); + //bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -279,7 +253,12 @@ void bli_gemmt_u_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + // Compute the diagonal offset for the column of microtiles at (0,j). + const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -287,38 +266,41 @@ void bli_gemmt_u_ker_var2 // Interior loop over the m dimension (MR rows at a time). for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; - - // Compute the diagonal offset for the submatrix at (i,j). - doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + // Compute the diagonal offset for the microtile at (i,j). + const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); - // Compute the addresses of the next panels of A and B. - const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) - { - a2 = a_cast; - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; - } - - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - bli_auxinfo_set_next_b( b2, &aux ); - - // If the diagonal intersects the current MR x NR submatrix, we + // If the diagonal intersects the current MR x NR microtile, we // compute it the temporary buffer and then add in the elements // on or below the diagonal. - // Otherwise, if the submatrix is strictly above the diagonal, + // Otherwise, if the microtile is strictly above the diagonal, // we compute and store as we normally would. // And if we're strictly below the diagonal, we do nothing and - // continue. + // continue on through the IR loop to consider the next MR x NR + // microtile. 
if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter_u( diagoffc_ij, MR, NR, ir_inc ) ) + { + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR ); + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + // Invoke the gemm micro-kernel. gemm_ukr ( @@ -335,93 +317,28 @@ void bli_gemmt_u_ker_var2 ); // Scale C and add the result to only the stored part. - xpbys_mxn_u_ukr( diagoffc_ij, - m_cur, n_cur, - ct, rs_ct, cs_ct, - ( void* )beta_cast, - c11, rs_c, cs_c ); - } - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) - { - // Invoke the gemm micro-kernel. - gemm_ukr + xpbys_mxn_u_ukr ( - m_cur, - n_cur, - k, - ( void* )alpha_cast, - ( void* )a1, - ( void* )b1, + diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, ( void* )beta_cast, - c11, rs_c, cs_c, - &aux, - ( cntx_t* )cntx + c11, rs_c, cs_c ); } - } - } - - // If there is no rectangular region, then we're done. - if ( n_iter_rct == 0 ) return; - - // Determine the thread range and increment for the 2nd loop of the - // remaining rectangular region of C (and also use default partitioning - // for the 1st loop). - // NOTE: The definition of bli_thread_range_jrir() will depend on whether - // slab or round-robin partitioning was requested at configure-time. - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - - // Advance the start and end iteration offsets for the rectangular region - // by the number of iterations used for the triangular region. - jr_start += n_iter_tri; - jr_end += n_iter_tri; - - // Loop over the n dimension (NR columns at a time). - for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) - { - const char* b1 = b_cast + j * cstep_b; - char* c1 = c_cast + j * cstep_c; - - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); - - // Initialize our next panel of B to be the current panel of B. - const char* b2 = b1; - - // Interior loop over the m dimension (MR rows at a time). - for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) - { - const char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; - - // No need to compute the diagonal offset for the rectangular - // region. - //diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ - - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); - - // Compute the addresses of the next panels of A and B. - const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) { - a2 = a_cast; - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; - } + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; - // Save addresses of next panels of A and B to the auxinfo_t - // object. - bli_auxinfo_set_next_a( a2, &aux ); - bli_auxinfo_set_next_b( b2, &aux ); + // Compute the addresses of the next panel of A. 
+ const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); - // If the diagonal intersects the current MR x NR submatrix, we - // compute it the temporary buffer and then add in the elements - // on or below the diagonal. - // Otherwise, if the submatrix is strictly above the diagonal, - // we compute and store as we normally would. - // And if we're strictly below the diagonal, we do nothing and - // continue. - { // Invoke the gemm micro-kernel. gemm_ukr ( diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2b.c b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c new file mode 100644 index 0000000000..91275577a4 --- /dev/null +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2b.c @@ -0,0 +1,386 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +typedef void (*xpbys_mxn_u_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); + +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +BLIS_INLINE void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} + +INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn); + +static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn); + + +void bli_gemmt_u_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_c = bli_obj_dt( c ); + + doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt_exec ]; + + // Temporary C buffer for edge cases. Note that the strides of this + // temporary buffer are set so that they match the storage of the + // original C matrix. For example, if C is column-stored, ct will be + // column-stored as well. + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); + + const void* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of C is entirely below the diagonal, + // it is not stored. So we do nothing. + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; + + // If there is a zero region to the left of where the diagonal of C + // intersects the top edge of the panel, adjust the pointer to C and B + // and treat this case as if the diagonal offset were zero. + // NOTE: It's possible that after this pruning that the diagonal offset + // is still positive (though it is guaranteed to be less than NR). + if ( diagoffc > 0 ) + { + const dim_t jp = diagoffc / NR; + const dim_t j = jp * NR; + + n = n - j; + diagoffc = diagoffc % NR; + c_cast = c_cast + (j )*cs_c*dt_c_size; + b_cast = b_cast + (jp )*ps_b*dt_size; + } + + // If there is a zero region below where the diagonal of C intersects + // the right edge of the panel, shrink it to prevent "no-op" iterations + // from executing. + if ( -diagoffc + n < m ) + { + m = -diagoffc + n; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_c_size; + const inc_t cstep_c = cs_c * NR * dt_c_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // Save the imaginary stride of A and B to the auxinfo_t object. + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + // Determine the starting microtile offsets and number of microtiles to + // compute for each thread. Note that assignment of microtiles is done + // according to the tlb policy. + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_u( jr_nt, jr_tid, diagoffc, m_iter, n_iter, MR, NR, + &jr_st, &ir_st ); + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. 
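Once bli_thread_range_tlb_u() has produced jr_st, ir_st, and n_ut_for_me, the loops below simply walk the stored microtiles in column-major order, starting from that offset and stopping as soon as the thread's quota is met. The following sketch, which is illustrative rather than BLIS source, traces that walk for a small upper-stored panel; cutting a column short when a strictly-below tile is reached corresponds to the "i = m_iter - 1" jump in the macrokernel:

// Conceptual trace of one thread's microtile walk in the upper tlb macrokernel.
// Helper names and the full-tile diagonal test are illustrative, not BLIS code.

#include <stdbool.h>
#include <stdio.h>

typedef long long dim_ll;

// A full MR x NR microtile at (i,j) is strictly below the diagonal when even
// its top-right element lies below it (element (r,c) is on the diagonal when
// c - r == diagoffc).
static bool tile_strictly_below( dim_ll diagoffc, dim_ll MR, dim_ll NR,
                                 dim_ll i, dim_ll j )
{
    const dim_ll d_ij = diagoffc - j*NR + i*MR;
    return NR <= d_ij;
}

// Print the microtiles one thread would compute.
static void walk_upper_tiles( dim_ll diagoffc, dim_ll MR, dim_ll NR,
                              dim_ll m_iter, dim_ll n_iter,
                              dim_ll jr_st, dim_ll ir_st, dim_ll n_ut_for_me )
{
    dim_ll ut = 0;
    dim_ll i  = ir_st;

    for ( dim_ll j = jr_st; j < n_iter; ++j )
    {
        for ( ; i < m_iter; ++i )
        {
            // Once a tile is strictly below the diagonal, so is the rest of
            // the column; the macrokernel jumps to the column's end instead.
            if ( tile_strictly_below( diagoffc, MR, NR, i, j ) ) break;

            printf( "compute microtile (i,j) = (%lld,%lld)\n", i, j );

            if ( ++ut == n_ut_for_me ) return;   // this thread's quota is met
        }
        i = 0;   // the next column starts again at the top
    }
}

int main( void )
{
    // Example: 4x4 grid of 8x8 microtiles of an upper-stored C (diagoffc = 0);
    // suppose this thread was assigned 5 microtiles starting at (ir,jr) = (1,1).
    walk_upper_tiles( /*diagoffc*/ 0, /*MR*/ 8, /*NR*/ 8,
                      /*m_iter*/ 4, /*n_iter*/ 4,
                      /*jr_st*/ 1, /*ir_st*/ 1, /*n_ut_for_me*/ 5 );
    return 0;
}

With these inputs the thread visits (1,1), (0,2), (1,2), (2,2), and (0,3) before returning, which is the same early-return pattern implemented via the ut counter in the kernels above and below.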
+ if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + // Compute the diagonal offset for the column of microtiles at (0,j). + const doff_t diagoffc_j = diagoffc - ( doff_t )j*NR; + + // Compute the current microtile's width. + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Interior loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + // Compute the diagonal offset for the microtile at (i,j). + const doff_t diagoffc_ij = diagoffc_j + ( doff_t )i*MR; + + // Compute the current microtile's length. + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // If the diagonal intersects the current MR x NR microtile, we + // compute it the temporary buffer and then add in the elements + // on or below the diagonal. + // Otherwise, if the microtile is strictly above the diagonal, + // we compute and store as we normally would. + // And if we're strictly below the diagonal, we simply advance + // to last microtile before the bottom of the matrix. + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_tlb_u( diagoffc_ij, MR, NR ) ) + { + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, diagoffc_j, MR, NR ); + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + // Scale C and add the result to only the stored part. + xpbys_mxn_u_ukr + ( + diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; + if ( ut == n_ut_for_me ) return; + } + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + // Compute the addresses of the next panel of A. + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, 1 ); + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; + if ( ut == n_ut_for_me ) return; + } + else // if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + // Skip past the microtiles strictly below the diagonal. 
+ i = m_iter - 1; + } + } + + // Upon reaching the end of the column of microtiles, get ready to begin + // at the beginning of the next column (i.e., the next jr loop iteration). + i = 0; + } +} + diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h index eb6e160180..339b937555 100644 --- a/frame/3/gemmt/bli_gemmt_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -43,46 +43,19 @@ \ void PASTEMAC0(opname) \ ( \ - const obj_t* a, \ - const obj_t* ah, \ - const obj_t* c, \ - const cntx_t* cntx, \ - const cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* ah, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const cntl_t* cntl, \ + thrinfo_t* thread_par \ ); GENPROT( gemmt_x_ker_var2 ) - GENPROT( gemmt_l_ker_var2 ) GENPROT( gemmt_u_ker_var2 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( gemmt_l_ker_var2 ) -INSERT_GENTPROT_BASIC0( gemmt_u_ker_var2 ) +GENPROT( gemmt_x_ker_var2b ) +GENPROT( gemmt_l_ker_var2b ) +GENPROT( gemmt_u_ker_var2b ) diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 207e1c938f..8081537b91 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -42,12 +42,12 @@ static l3_var_oft vars[2] = void bli_gemmt_x_ker_var2 ( - const obj_t* a, - const obj_t* ah, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* ah, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { dim_t uplo; @@ -67,7 +67,7 @@ void bli_gemmt_x_ker_var2 c, cntx, cntl, - thread + thread_par ); } diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2b.c b/frame/3/gemmt/bli_gemmt_x_ker_var2b.c new file mode 100644 index 0000000000..132d7c13a9 --- /dev/null +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2b.c @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +static l3_var_oft vars[2] = +{ + bli_gemmt_l_ker_var2b, bli_gemmt_u_ker_var2b, +}; + +void bli_gemmt_x_ker_var2b + ( + const obj_t* a, + const obj_t* ah, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + dim_t uplo; + + // Set a bool based on the uplo field of C's root object. + if ( bli_obj_root_is_lower( c ) ) uplo = 0; + else uplo = 1; + + // Index into the variant array to extract the correct function pointer. + l3_var_oft f = vars[uplo]; + + // Call the macrokernel. + f + ( + a, + ah, + c, + cntx, + cntl, + thread_par + ); +} + diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev new file mode 100644 index 0000000000..aed0359ecb --- /dev/null +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c.prev @@ -0,0 +1,507 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); + + +void bli_gemmt_l_ker_var2 + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, ip; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. */ \ + if ( diagoffc < 0 ) \ + { \ + ip = -diagoffc / MR; \ + i = ip * MR; \ + m = m - i; \ + diagoffc = -diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. */ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
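As a point of reference for the thread-range queries just below: the two configure-time partitioning schemes referred to throughout these kernels (slab vs. round-robin) can be modeled with a small standalone toy. This is not BLIS code; toy_range_slab()/toy_range_rr() are made-up helpers and do not reflect the actual bli_thread_range_jrir()/bli_thread_range_slrr() signatures. Slab hands each thread one contiguous chunk of iterations; round-robin strides through the iteration space with a step equal to the number of threads.

#include <stdio.h>

// Toy model: n_iter iteration units split across nt threads.
static void toy_range_slab( int tid, int nt, int n_iter,
                            int* start, int* end, int* inc )
{
    int len = ( n_iter + nt - 1 ) / nt;          // ceiling( n_iter / nt )
    *start  = tid * len;
    *end    = ( ( tid + 1 ) * len < n_iter ? ( tid + 1 ) * len : n_iter );
    *inc    = 1;                                 // contiguous chunk
}

static void toy_range_rr( int tid, int nt, int n_iter,
                          int* start, int* end, int* inc )
{
    *start = tid;                                // every nt-th unit
    *end   = n_iter;
    *inc   = nt;
}

int main( void )
{
    int s, e, inc;
    toy_range_slab( 1, 4, 10, &s, &e, &inc );       // thread 1 of 4, 10 units
    printf( "slab: %d..%d step %d\n", s, e, inc );  // slab: 3..6 step 1
    toy_range_rr  ( 1, 4, 10, &s, &e, &inc );
    printf( "rr:   %d..%d step %d\n", s, e, inc );  // rr:   1..10 step 4
    return 0;
}

Either way, each thread visits a disjoint subset of the iteration space; only the shape of that subset differs.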
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the rectangular + part of C, and the triangular portion. */ \ + dim_t n_iter_rct; \ + dim_t n_iter_tri; \ +\ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the second set of + loops. */ \ + n_iter_rct = n_iter; \ + n_iter_tri = 0; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the rectangular region by dividing NR into the diagonal + offset. Any remainder from this integer division is discarded, which + is what we want. That is, we want the rectangular region to contain + as many columns of whole microtiles as possible without including any + microtiles that intersect the diagonal. The number of iterations in + the triangular (or trapezoidal) region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_rct = diagoffc / NR; \ + n_iter_tri = n_iter - n_iter_rct; \ + } \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops for + the initial rectangular region of C (if it exists). + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + { \ + /* Invoke the gemm micro-kernel. 
*/ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* If there is no triangular region, then we're done. */ \ + if ( n_iter_tri == 0 ) return; \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and the default (slab or rr) partitioning in the 1st loop for the + remaining triangular region of C. */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the triangular region + by the number of iterations used for the rectangular region. */ \ + jr_start += n_iter_rct; \ + jr_end += n_iter_rct; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) + diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before b/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before new file mode 100644 index 0000000000..4285bd1356 --- /dev/null +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2b.c.before @@ -0,0 +1,427 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2b); + + +void bli_gemmt_l_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. 
+ const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely above the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region above where the diagonal of C intersects + the left edge of the panel, adjust the pointer to C and A and treat + this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still negative (though its absolute value is guaranteed to be less + than MR). */ \ + if ( diagoffc < 0 ) \ + { \ + const dim_t ip = -diagoffc / MR; \ + const dim_t i = ip * MR; \ +\ + m = m - i; \ + diagoffc = diagoffc % MR; \ + c_cast = c_cast + (i )*rs_c; \ + a_cast = a_cast + (ip )*ps_a; \ + } \ +\ + /* If there is a zero region to the right of where the diagonal + of C intersects the bottom of the panel, shrink it to prevent + "no-op" iterations from executing. 
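The pruning logic above is easiest to see with numbers. Below is a standalone worked example with assumed values MR = 8, an m x n = 40 x 30 panel, and diagoffc = -20; the accompanying pointer adjustments to c_cast and a_cast are omitted.

#include <stdio.h>

int main( void )
{
    int MR = 8, m = 40, n = 30, diagoffc = -20;

    if ( diagoffc < 0 )
    {
        int ip = -diagoffc / MR;    // 20 / 8 = 2 whole micropanels of A skipped
        int i  = ip * MR;           // 16 rows of C skipped
        m        = m - i;           // 40 - 16 = 24
        diagoffc = diagoffc % MR;   // -20 % 8 = -4 (may stay negative, |.| < MR)
    }

    // Shrink n so that columns entirely to the right of the stored region
    // are never visited.
    if ( diagoffc + m < n ) n = diagoffc + m;       // -4 + 24 = 20 < 30, so n = 20

    printf( "m = %d, n = %d, diagoffc = %d\n", m, n, diagoffc );  // 24, 20, -4
    return 0;
}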
*/ \ + if ( diagoffc + m < n ) \ + { \ + n = diagoffc + m; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \ + const dim_t n_left = n % NR; \ +\ + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \ + const dim_t m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + const inc_t rstep_a = ps_a; \ +\ + const inc_t cstep_b = ps_b; \ +\ + const inc_t rstep_c = rs_c * MR; \ + const inc_t cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the virtual microkernel address and the params. */ \ + /*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \ + /*bli_auxinfo_set_params( params, &aux );*/ \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + const dim_t jr_nt = bli_thread_n_way( thread ); \ + const dim_t jr_tid = bli_thread_work_id( thread ); \ + const dim_t ir_nt = bli_thread_n_way( caucus ); \ + const dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ +/* +*/ \ + bli_thread_range_weighted_jr( thread, diagoffc, BLIS_LOWER, m, n, NR, \ + FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );*/ \ +/* +*/ \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +/* + dim_t jr_st, ir_st; \ + const dim_t n_ut_for_me \ + = \ + bli_thread_range_tlb( thread, diagoffc, BLIS_LOWER, m, n, MR, NR, \ + &jr_st, &ir_st ); \ +*/ \ +\ +/* +printf( "bli_gemmt_l_ker_var2b(): tid %d: m n = %d %d st en in = %3d %3d %3d do %d\n", (int)jr_tid, (int)m, (int)n, (int)jr_start, (int)jr_end, (int)jr_inc, (int)diagoffc ); \ +*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict b1 = b_cast + j * cstep_b; \ + ctype* restrict c1 = c_cast + j * cstep_c; \ +\ + /* Compute the diagonal offset for the column of microtiles at (0,j). */ \ + const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \ + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) \ + ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + ctype* restrict b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + /* Compute the diagonal offset for the microtile at (i,j). 
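For a concrete picture of how these per-column and per-microtile offsets classify each MR x NR microtile, here is a standalone sketch with assumed values MR = NR = 4, a panel-level offset of 0, and full-size microtiles; the inequalities mirror the intent of bli_is_strictly_above_diag_n(), bli_is_strictly_below_diag_n(), and bli_intersects_diag_n() for the lower-stored case handled here.

#include <stdio.h>

int main( void )
{
    const int MR = 4, NR = 4, diagoffc = 0;
    const int m_cur = 4, n_cur = 4;

    for ( int j = 0; j < 2; ++j )
    for ( int i = 0; i < 3; ++i )
    {
        int diagoffc_j  = diagoffc - j*NR;     // offset of microtile column j
        int diagoffc_ij = diagoffc_j + i*MR;   // offset of microtile (i,j)

        const char* where;
        if      ( m_cur <= -diagoffc_ij ) where = "strictly above (skipped)";
        else if ( n_cur <=  diagoffc_ij ) where = "strictly below (direct ukr to C)";
        else                              where = "intersects (via temp buffer ct)";

        printf( "microtile (%d,%d): diagoff %3d -> %s\n", i, j, diagoffc_ij, where );
    }
    return 0;
}

For example, microtile (0,1) gets offset -4 and is skipped, (1,1) gets offset 0 and goes through ct, and (2,1) gets offset 4 and is written directly to C.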
*/ \ + const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \ + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) \ + ? MR : m_left ); \ +\ + /* If the diagonal intersects the current MR x NR microtile, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the microtile is strictly below the diagonal, + we compute and store as we normally would. + And if we're strictly above the diagonal, we do nothing and + continue on through the IR loop to consider the next MR x NR + microtile. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = bli_gemmt_l_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2b ) + diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev new file mode 100644 index 0000000000..87d77ee554 --- /dev/null +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c.prev @@ -0,0 +1,510 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); + + +void bli_gemmt_u_ker_var2 + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. 
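The ftypes array used just below follows a common datatype-dispatch pattern: GENARRAY expands to an array of per-type function pointers, which is then indexed by the execution datatype. A minimal standalone analogue is sketched here; the toy enum and kernel names are made up and stand in for BLIS's num_t and GENARRAY machinery.

#include <stdio.h>

// Toy stand-ins for BLIS's num_t enum and a GENARRAY-generated table.
typedef enum { TOY_FLOAT = 0, TOY_DOUBLE = 1, TOY_NUM_DT = 2 } toy_num_t;

typedef void (*toy_fp)( const void* alpha );

static void toy_kernel_s( const void* alpha )
{ printf( "float kernel,  alpha = %g\n", (double)*( const float* )alpha ); }

static void toy_kernel_d( const void* alpha )
{ printf( "double kernel, alpha = %g\n", *( const double* )alpha ); }

// Analogue of: static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2);
static toy_fp ftypes[ TOY_NUM_DT ] = { toy_kernel_s, toy_kernel_d };

int main( void )
{
    double    alpha = 1.5;
    toy_num_t dt    = TOY_DOUBLE;  // in BLIS this comes from bli_obj_exec_dt( c )
    ftypes[ dt ]( &alpha );        // analogue of ftypes[dt_exec]( ... )
    return 0;
}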
+ ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. */ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ + ctype* restrict b1; \ + ctype* restrict c1; \ +\ + doff_t diagoffc_ij; \ + dim_t m_iter, m_left; \ + dim_t n_iter, n_left; \ + dim_t m_cur; \ + dim_t n_cur; \ + dim_t i, j, jp; \ + inc_t rstep_a; \ + inc_t cstep_b; \ + inc_t rstep_c, cstep_c; \ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ + if ( diagoffc > 0 ) \ + { \ + jp = diagoffc / NR; \ + j = jp * NR; \ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. 
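As with the lower-stored variant, the pruning above can be made concrete with a worked example (a standalone sketch assuming NR = 4, an m x n = 36 x 40 panel, and diagoffc = +10; the pointer adjustments to c_cast and b_cast are omitted).

#include <stdio.h>

int main( void )
{
    int NR = 4, m = 36, n = 40, diagoffc = 10;

    if ( diagoffc > 0 )
    {
        int jp = diagoffc / NR;     // 10 / 4 = 2 whole micropanels of B skipped
        int j  = jp * NR;           // 8 columns of C skipped
        n        = n - j;           // 40 - 8 = 32
        diagoffc = diagoffc % NR;   // 10 % 4 = 2 (may stay positive, < NR)
    }

    // Shrink m so that rows entirely below the stored region are never visited.
    if ( -diagoffc + n < m ) m = -diagoffc + n;     // -2 + 32 = 30 < 36, so m = 30

    printf( "m = %d, n = %d, diagoffc = %d\n", m, n, diagoffc );  // 30, 32, 2
    return 0;
}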
*/ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + n_iter = n / NR; \ + n_left = n % NR; \ +\ + m_iter = m / MR; \ + m_left = m % MR; \ +\ + if ( n_left ) ++n_iter; \ + if ( m_left ) ++m_iter; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + rstep_a = ps_a; \ +\ + cstep_b = ps_b; \ +\ + rstep_c = rs_c * MR; \ + cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. */ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Note that we partition the 2nd loop into two regions: the triangular + part of C, and the rectangular portion. */ \ + dim_t n_iter_tri; \ + dim_t n_iter_rct; \ +\ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ + { \ + /* If the entire panel of C does not intersect the diagonal, there is + no triangular region, and therefore we can skip the first set of + loops. */ \ + n_iter_tri = 0; \ + n_iter_rct = n_iter; \ + } \ + else \ + { \ + /* If the panel of C does intersect the diagonal, compute the number of + iterations in the triangular (or trapezoidal) region by dividing NR + into the number of rows in C. A non-zero remainder means we need to + add one additional iteration. That is, we want the triangular region + to contain as few columns of whole microtiles as possible while still + including all microtiles that intersect the diagonal. The number of + iterations in the rectangular region is computed as the remaining + number of iterations in the n dimension. */ \ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ + n_iter_rct = n_iter - n_iter_tri; \ + } \ +\ + /* Use round-robin assignment of micropanels to threads in the 2nd loop + and the default (slab or rr) partitioning in the 1st loop for the + initial triangular region of C (if it exists). */ \ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ + bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. 
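A worked example of the triangular/rectangular split computed above may help (a standalone sketch assuming NR = 4, an m x n = 20 x 40 panel, and a post-pruning diagonal offset of 2; in the lower-stored variant earlier in this patch the analogous rectangular count is simply diagoffc / NR).

#include <stdio.h>

int main( void )
{
    const int NR = 4, m = 20, n = 40, diagoffc = 2;

    const int n_iter     = n / NR + ( n % NR ? 1 : 0 );        // 10 microtile columns
    const int n_iter_tri = ( m + diagoffc ) / NR
                         + ( ( m + diagoffc ) % NR ? 1 : 0 );  // ceil(22/4) = 6
    const int n_iter_rct = n_iter - n_iter_tri;                // 10 - 6 = 4

    printf( "n_iter = %d, tri = %d, rct = %d\n", n_iter, n_iter_tri, n_iter_rct );
    return 0;
}

That is, the first six microtile columns may touch the diagonal and get round-robin assignment, while the last four lie entirely above the diagonal (fully stored) and use the default partitioning.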
*/ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* Compute the diagonal offset for the submatrix at (i,j). */ \ + diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. */ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +\ + /* If there is no rectangular region, then we're done. */ \ + if ( n_iter_rct == 0 ) return; \ +\ + /* Determine the thread range and increment for the 2nd loop of the + remaining rectangular region of C (and also use default partitioning + for the 1st loop). + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ +\ + /* Advance the start and end iteration offsets for the rectangular region + by the number of iterations used for the triangular region. */ \ + jr_start += n_iter_tri; \ + jr_end += n_iter_tri; \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict a1; \ + ctype* restrict c11; \ + ctype* restrict b2; \ +\ + b1 = b_cast + j * cstep_b; \ + c1 = c_cast + j * cstep_c; \ +\ + n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + ctype* restrict a2; \ +\ + a1 = a_cast + i * rstep_a; \ + c11 = c1 + i * rstep_c; \ +\ + /* No need to compute the diagonal offset for the rectangular + region. */ \ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ +\ + m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = a_cast; \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue. */ \ + { \ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) + diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before b/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before new file mode 100644 index 0000000000..dbf8f389f1 --- /dev/null +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2b.c.before @@ -0,0 +1,415 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define FUNCPTR_T gemmt_fp + +typedef void (*FUNCPTR_T) + ( + doff_t diagoffc, + pack_t schema_a, + pack_t schema_b, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t cs_a, inc_t is_a, + dim_t pd_a, inc_t ps_a, + void* b, inc_t rs_b, inc_t is_b, + dim_t pd_b, inc_t ps_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread + ); + +static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2b); + + +void bli_gemmt_u_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread + ) +{ + const num_t dt_exec = bli_obj_exec_dt( c ); + + const doff_t diagoffc = bli_obj_diag_offset( c ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Index into the type combination array to extract the correct + // function pointer. + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); +} + + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, varname ) \ +\ +void PASTEMAC(ch,varname) \ + ( \ + doff_t diagoffc, \ + pack_t schema_a, \ + pack_t schema_b, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t cs_a, inc_t is_a, \ + dim_t pd_a, inc_t ps_a, \ + void* b, inc_t rs_b, inc_t is_b, \ + dim_t pd_b, inc_t ps_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + /* Alias some constants to simpler names. */ \ + const dim_t MR = pd_a; \ + const dim_t NR = pd_b; \ + /*const dim_t PACKMR = cs_a;*/ \ + /*const dim_t PACKNR = rs_b;*/ \ +\ + /* Query the context for the micro-kernel address and cast it to its + function pointer type. */ \ + PASTECH(ch,gemm_ukr_ft) \ + gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ +\ + /* Temporary C buffer for edge cases. Note that the strides of this + temporary buffer are set so that they match the storage of the + original C matrix. For example, if C is column-stored, ct will be + column-stored as well. 
*/ \ + ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ + const inc_t rs_ct = ( col_pref ? 1 : NR ); \ + const inc_t cs_ct = ( col_pref ? MR : 1 ); \ +\ + ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* restrict a_cast = a; \ + ctype* restrict b_cast = b; \ + ctype* restrict c_cast = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ +\ + auxinfo_t aux; \ +\ + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ \ +\ + /* If any dimension is zero, return immediately. */ \ + if ( bli_zero_dim3( m, n, k ) ) return; \ +\ + /* Safeguard: If the current panel of C is entirely below the diagonal, + it is not stored. So we do nothing. */ \ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ +\ + /* If there is a zero region to the left of where the diagonal of C + intersects the top edge of the panel, adjust the pointer to C and B + and treat this case as if the diagonal offset were zero. + NOTE: It's possible that after this pruning that the diagonal offset + is still positive (though it is guaranteed to be less than NR). */ \ + if ( diagoffc > 0 ) \ + { \ + const dim_t jp = diagoffc / NR; \ + const dim_t j = jp * NR; \ +\ + n = n - j; \ + diagoffc = diagoffc % NR; \ + c_cast = c_cast + (j )*cs_c; \ + b_cast = b_cast + (jp )*ps_b; \ + } \ +\ + /* If there is a zero region below where the diagonal of C intersects + the right edge of the panel, shrink it to prevent "no-op" iterations + from executing. */ \ + if ( -diagoffc + n < m ) \ + { \ + m = -diagoffc + n; \ + } \ +\ + /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ + PASTEMAC(ch,set0s_mxn)( MR, NR, \ + ct, rs_ct, cs_ct ); \ +\ + /* Compute number of primary and leftover components of the m and n + dimensions. */ \ + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); \ + const dim_t n_left = n % NR; \ +\ + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); \ + const dim_t m_left = m % MR; \ +\ + /* Determine some increments used to step through A, B, and C. */ \ + const inc_t rstep_a = ps_a; \ +\ + const inc_t cstep_b = ps_b; \ +\ + const inc_t rstep_c = rs_c * MR; \ + const inc_t cstep_c = cs_c * NR; \ +\ + /* Save the pack schemas of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_schema_a( schema_a, &aux ); \ + bli_auxinfo_set_schema_b( schema_b, &aux ); \ +\ + /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ + bli_auxinfo_set_is_a( is_a, &aux ); \ + bli_auxinfo_set_is_b( is_b, &aux ); \ +\ + /* Save the virtual microkernel address and the params. */ \ + /*bli_auxinfo_set_ukr( gemm_ukr, &aux );*/ \ + /*bli_auxinfo_set_params( params, &aux );*/ \ +\ + /* Save the desired output datatype (indicating no typecasting). */ \ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ +\ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + loop around the microkernel. Here we query the thrinfo_t node for the + 1st (ir) loop around the microkernel. */ \ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ +\ + /* Query the number of threads and thread ids for each loop. 
*/ \ + dim_t jr_nt = bli_thread_n_way( thread ); \ + dim_t jr_tid = bli_thread_work_id( thread ); \ + dim_t ir_nt = bli_thread_n_way( caucus ); \ + dim_t ir_tid = bli_thread_work_id( caucus ); \ +\ + dim_t jr_start, jr_end; \ + dim_t ir_start, ir_end; \ + dim_t jr_inc, ir_inc; \ +\ + /* Determine the thread range and increment for the 2nd and 1st loops. + NOTE: The definition of bli_thread_range_jrir() will depend on whether + slab or round-robin partitioning was requested at configure-time. */ \ + bli_thread_range_weighted_jr( thread, diagoffc, BLIS_UPPER, m, n, NR, \ + FALSE, &jr_start, &jr_end, &jr_inc ); \ + /*bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc );*/ \ + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ +\ +/* +printf( "bli_gemmt_u_ker_var2b(): tid %d: m n = %d %d st en in = %3d %3d %3d do %d\n", (int)jr_tid, (int)m, (int)n, (int)jr_start, (int)jr_end, (int)jr_inc, (int)diagoffc ); \ +*/ \ +\ + /* Loop over the n dimension (NR columns at a time). */ \ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) \ + { \ + ctype* restrict b1 = b_cast + j * cstep_b; \ + ctype* restrict c1 = c_cast + j * cstep_c; \ +\ + /* Compute the diagonal offset for the column of microtiles at (0,j). */ \ + const doff_t diagoffc_j = diagoffc - (doff_t)j*NR; \ + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) \ + ? NR : n_left ); \ +\ + /* Initialize our next panel of B to be the current panel of B. */ \ + ctype* restrict b2 = b1; \ +\ + /* Interior loop over the m dimension (MR rows at a time). */ \ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) \ + { \ + /* Compute the diagonal offset for the microtile at (i,j). */ \ + const doff_t diagoffc_ij = diagoffc_j + (doff_t)i*MR; \ + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) \ + ? MR : m_left ); \ +\ + /* If the diagonal intersects the current MR x NR submatrix, we + compute it the temporary buffer and then add in the elements + on or below the diagonal. + Otherwise, if the submatrix is strictly above the diagonal, + we compute and store as we normally would. + And if we're strictly below the diagonal, we do nothing and + continue on through the IR loop to consider the next MR x NR + microtile. */ \ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + MR, \ + NR, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + zero, \ + ct, rs_ct, cs_ct, \ + &aux, \ + cntx \ + ); \ +\ + /* Scale C and add the result to only the stored part. 
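The write-back step referenced here can be modeled with a small standalone sketch; it assumes the xpbys_mxn_u macro performs c := ct + beta * c, restricted to the stored elements (those on or above the diagonal implied by the microtile's diagonal offset). The names and values below are illustrative only.

#include <stdio.h>

int main( void )
{
    const int m = 2, n = 2, diagoff = 0;
    double beta = 10.0;
    double ct[2][2] = { { 1.0, 2.0 }, { 3.0, 4.0 } };  // full MR x NR microkernel result
    double c [2][2] = { { 0.1, 0.2 }, { 0.3, 0.4 } };  // stored block of C

    for ( int i = 0; i < m; ++i )
    for ( int j = 0; j < n; ++j )
        if ( j - i >= diagoff )                        // stored (upper) part only
            c[i][j] = ct[i][j] + beta * c[i][j];

    for ( int i = 0; i < m; ++i )
        printf( "%5.1f %5.1f\n", c[i][0], c[i][1] );
    // expected:   2.0   4.0
    //             0.3   8.0   (c[1][0] is below the diagonal and left untouched)
    return 0;
}

This is why the microkernel is invoked with a zero beta and its full MR x NR output routed into ct: the scaling by the user's beta and the masking to the stored region happen in this separate accumulation step.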
*/ \ + PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ + m_cur, n_cur, \ + ct, rs_ct, cs_ct, \ + beta_cast, \ + c11, rs_c, cs_c ); \ + } \ + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ + { \ + ctype* restrict a1 = a_cast + i * rstep_a; \ + ctype* restrict c11 = c1 + i * rstep_c; \ +\ + /* Compute the addresses of the next panels of A and B. */ \ + ctype* restrict a2 \ + = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ + { \ + a2 = bli_gemmt_u_wrap_a_upanel( a_cast, rstep_a, \ + diagoffc_j, MR, NR ); \ + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ + b2 = b_cast; \ + } \ +\ + /* Save addresses of next panels of A and B to the auxinfo_t + object. */ \ + bli_auxinfo_set_next_a( a2, &aux ); \ + bli_auxinfo_set_next_b( b2, &aux ); \ +\ + /* Invoke the gemm micro-kernel. */ \ + gemm_ukr \ + ( \ + m_cur, \ + n_cur, \ + k, \ + alpha_cast, \ + a1, \ + b1, \ + beta_cast, \ + c11, rs_c, cs_c, \ + &aux, \ + cntx \ + ); \ + } \ + } \ + } \ +} + +INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2b ) + diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 3bc4e3c6b4..0c5cde72c1 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -37,11 +37,11 @@ void bli_trmm_ll_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -83,10 +83,10 @@ void bli_trmm_ll_ker_var2 const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Alias some constants to simpler names. - const dim_t MR = pd_a; - const dim_t NR = pd_b; - const dim_t PACKMR = cs_a; - const dim_t PACKNR = rs_b; + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. @@ -140,50 +140,45 @@ void bli_trmm_ll_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. 
- //thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread ); + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - //dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); - //dim_t ir_tid = bli_thrinfo_work_id( ir_thread ); + //const dim_t jr_nt = bli_thrinfo_n_way( thread ); + //const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - //dim_t ir_start, ir_end; - dim_t jr_inc; + dim_t jr_start, jr_end, jr_inc; // Determine the thread range and increment for the 2nd loop. - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is disabled for now. - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - //bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -191,20 +186,24 @@ void bli_trmm_ll_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; + // Initialize pointers for stepping through the block of A and current + // column of microtiles of C. const char* a1 = a_cast; char* c11 = c1; // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { - doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // If the current panel of A intersects the diagonal, scale C // by beta. If it is strictly below the diagonal, scale by one. @@ -215,8 +214,8 @@ void bli_trmm_ll_ker_var2 // Determine the offset to and length of the panel that was // packed so we can index into the corresponding location in // b1. - dim_t off_a1011 = 0; - dim_t k_a1011 = bli_min( diagoffa_i + MR, k ); + const dim_t off_a1011 = 0; + const dim_t k_a1011 = bli_min( diagoffa_i + MR, k ); // Compute the panel stride for the current diagonal- // intersecting micro-panel. @@ -230,13 +229,13 @@ void bli_trmm_ll_ker_var2 const char* b1_i = b1 + off_a1011 * PACKNR * dt_size; // Compute the addresses of the next panels of A and B. 
- const char* a2 = a1; - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) ) { a2 = a_cast; - b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -268,13 +267,13 @@ void bli_trmm_ll_ker_var2 //if ( bli_trmm_my_iter( i, ir_thread ) ) { // Compute the addresses of the next panels of A and B. - const char* a2 = a1; - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) ) { a2 = a_cast; - b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -306,6 +305,6 @@ void bli_trmm_ll_ker_var2 } } -//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" ); -//PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2b.c b/frame/3/trmm/bli_trmm_ll_ker_var2b.c new file mode 100644 index 0000000000..bb6de00f5b --- /dev/null +++ b/frame/3/trmm/bli_trmm_ll_ker_var2b.c @@ -0,0 +1,365 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_trmm_ll_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current block of A is entirely above the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; + + // If there is a zero region above where the diagonal of A intersects the + // left edge of the block, adjust the pointer to C and treat this case as + // if the diagonal offset were zero. This skips over the region that was + // not packed. (Note we assume the diagonal offset is a multiple of MR; + // this assumption will hold as long as the cache blocksizes KC nd MC are + // each a multiple of MR.) + if ( diagoffa < 0 ) + { + m += diagoffa; + c_cast -= diagoffa * rs_c * dt_size; + diagoffa = 0; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 
1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Computing the number of MR x MR tiles in the k dimension is needed + // when computing the thread ranges below. + const dim_t k_iter = k / MR + ( k % MR ? 1 : 0 ); + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for the JR loop. + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_trmm_ll( jr_nt, jr_tid, diagoffa, m_iter, n_iter, k_iter, + MR, NR, &jr_st, &ir_st ); + +#if 0 + printf( "tid: %ld m,n,k_iter: %ld %ld %ld\n", tid, m_iter, n_iter, k_iter ); + printf( "tid: %ld trmm_ll_tlb begins at: %ld %ld (n_ut: %ld)\n", + tid, jr_st, ir_st, n_ut_for_me ); +#endif + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb_trmm_ll(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + const char* a1 = a_cast; + + // Get pointers into position by stepping through to the ith micropanel of + // A and ith microtile of C (within the appropriate column of microtiles). + for ( dim_t ii = 0; ii < ir_st; ++ii ) + { + const doff_t diagoffa_ii = diagoffa + ( doff_t )ii*MR; + + if ( bli_intersects_diag_n( diagoffa_ii, MR, k ) ) + { + // Determine the length of the panel that was packed. + const dim_t k_a1011 = bli_min( diagoffa_ii + MR, k ); + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_a_cur = k_a1011 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffa_ii, MR, k ) ) + { + a1 += rstep_a; + } + } + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + char* c11 = c1 + i * rstep_c; + + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? 
MR : m_left ); + + // If the current panel of A intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + // Determine the offset to and length of the panel that was + // packed so we can index into the corresponding location in B. + const dim_t off_a1011 = 0; + const dim_t k_a1011 = bli_min( diagoffa_i + MR, k ); + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_a_cur = k_a1011 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + const char* b1_i = b1 + off_a1011 * PACKNR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, ps_a_cur, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_a1011, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1_i, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) + { + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + + a1 += rstep_a; + } + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + // Reset the a1 pointer to the beginning of the packed matrix A. + a1 = a_cast; + } +} + +//PASTEMAC(ch,printm)( "trmm_ll_ker_var2b: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_ll_ker_var2b: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" ); + +//printf( "tid: %ld intersects diag. 
j,i: %ld %ld (ut: %ld)\n", tid, j, i, ut ); +//printf( "tid: %ld strictbelow diag j,i: %ld %ld (ut: %ld)\n", tid, j, i, ut ); + +//printf( "tid: %ld incrementing by ps_a_cur: %ld (k_a1011: %ld)\n", +// tid, ps_a_cur, k_a1011 ); +//printf( "tid: %ld incrementing by rstep_a: %ld (k : %ld)\n", +// tid, rstep_a, k ); + diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 265e21a66a..039bcc2926 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -37,11 +37,11 @@ void bli_trmm_lu_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -83,10 +83,10 @@ void bli_trmm_lu_ker_var2 const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Alias some constants to simpler names. - const dim_t MR = pd_a; - const dim_t NR = pd_b; - const dim_t PACKMR = cs_a; - const dim_t PACKNR = rs_b; + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. @@ -147,50 +147,45 @@ void bli_trmm_lu_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) // loop around the microkernel. Here we query the thrinfo_t node for the // 1st (ir) loop around the microkernel. - //thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread ); + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - //dim_t ir_nt = bli_thrinfo_n_way( ir_thread ); - //dim_t ir_tid = bli_thrinfo_work_id( ir_thread ); + //const dim_t jr_nt = bli_thrinfo_n_way( thread ); + //const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - //dim_t ir_start, ir_end; - dim_t jr_inc; + dim_t jr_start, jr_end, jr_inc; // Determine the thread range and increment for the 2nd loop. 
- // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is disabled for now. - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - //bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -198,20 +193,24 @@ void bli_trmm_lu_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; + // Initialize pointers for stepping through the block of A and current + // column of microtiles of C. const char* a1 = a_cast; char* c11 = c1; // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { - doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // If the current panel of A intersects the diagonal, scale C // by beta. If it is strictly above the diagonal, scale by one. @@ -222,8 +221,8 @@ void bli_trmm_lu_ker_var2 // Determine the offset to and length of the panel that was // packed so we can index into the corresponding location in // b1. - dim_t off_a1112 = diagoffa_i; - dim_t k_a1112 = k - off_a1112; + const dim_t off_a1112 = diagoffa_i; + const dim_t k_a1112 = k - off_a1112; // Compute the panel stride for the current diagonal- // intersecting micro-panel. @@ -237,13 +236,13 @@ void bli_trmm_lu_ker_var2 const char* b1_i = b1 + off_a1112 * PACKNR * dt_size; // Compute the addresses of the next panels of A and B. - const char* a2 = a1; - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) ) { a2 = a_cast; - b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -275,13 +274,13 @@ void bli_trmm_lu_ker_var2 //if ( bli_trmm_my_iter( i, ir_thread ) ) { // Compute the addresses of the next panels of A and B. 
- const char* a2 = a1; - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_slrr( i, m_iter, 0, 1 ) ) { a2 = a_cast; - b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -313,6 +312,6 @@ void bli_trmm_lu_ker_var2 } } -//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" ); -//PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" ); diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2b.c b/frame/3/trmm/bli_trmm_lu_ker_var2b.c new file mode 100644 index 0000000000..39640ad6bf --- /dev/null +++ b/frame/3/trmm/bli_trmm_lu_ker_var2b.c @@ -0,0 +1,366 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_trmm_lu_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffa = bli_obj_diag_offset( a ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current block of A is entirely below the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; + + // If there is a zero region to the left of where the diagonal of A + // intersects the top edge of the block, adjust the pointer to B and + // treat this case as if the diagonal offset were zero. Note that we + // don't need to adjust the pointer to A since packm would have simply + // skipped over the region that was not stored. (Note we assume the + // diagonal offset is a multiple of MR; this assumption will hold as + // long as the cache blocksizes KC nd MC are each a multiple of MR.) 
+ if ( diagoffa > 0 )
+ {
+ k -= diagoffa;
+ b_cast += diagoffa * PACKNR * dt_size;
+ diagoffa = 0;
+ }
+
+ // If there is a zero region below where the diagonal of A intersects the
+ // right side of the block, shrink it to prevent "no-op" iterations from
+ // executing.
+ if ( -diagoffa + k < m )
+ {
+ m = -diagoffa + k;
+ }
+
+ // Compute number of primary and leftover components of the m and n
+ // dimensions.
+ const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 );
+ const dim_t n_left = n % NR;
+
+ const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 );
+ const dim_t m_left = m % MR;
+
+ // Computing the number of MR x MR tiles in the k dimension is needed
+ // when computing the thread ranges below.
+ const dim_t k_iter = k / MR + ( k % MR ? 1 : 0 );
+
+ // Determine some increments used to step through A, B, and C.
+ const inc_t rstep_a = ps_a * dt_size;
+
+ const inc_t cstep_b = ps_b * dt_size;
+
+ const inc_t rstep_c = rs_c * MR * dt_size;
+ const inc_t cstep_c = cs_c * NR * dt_size;
+
+ auxinfo_t aux;
+
+ // Save the pack schemas of A and B to the auxinfo_t object.
+ bli_auxinfo_set_schema_a( schema_a, &aux );
+ bli_auxinfo_set_schema_b( schema_b, &aux );
+
+ // The 'thread' argument points to the thrinfo_t node for the 2nd (jr)
+ // loop around the microkernel. Here we query the thrinfo_t node for the
+ // 1st (ir) loop around the microkernel.
+ thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
+ //thrinfo_t* caucus = bli_thrinfo_sub_node( thread );
+
+ // Query the number of threads and thread ids for each loop.
+ const dim_t jr_nt = bli_thrinfo_n_way( thread );
+ const dim_t jr_tid = bli_thrinfo_work_id( thread );
+ //const dim_t ir_nt = bli_thrinfo_n_way( caucus );
+ //const dim_t ir_tid = bli_thrinfo_work_id( caucus );
+
+ dim_t jr_st, ir_st;
+ const dim_t n_ut_for_me
+ =
+ bli_thread_range_tlb_trmm_lu( jr_nt, jr_tid, diagoffa, m_iter, n_iter, k_iter,
+ MR, NR, &jr_st, &ir_st );
+
+ // It's possible that there are so few microtiles relative to the number
+ // of threads that one or more threads gets no work. If that happens, those
+ // threads can return early.
+ if ( n_ut_for_me == 0 ) return;
+
+ // Start the jr/ir loops with the current thread's microtile offsets computed
+ // by bli_thread_range_tlb_trmm_lu().
+ dim_t i = ir_st;
+ dim_t j = jr_st;
+
+ // Initialize a counter to track the number of microtiles computed by the
+ // current thread.
+ dim_t ut = 0;
+
+ const char* a1 = a_cast;
+
+ // Get pointers into position by stepping through to the ith micropanel of
+ // A and ith microtile of C (within the appropriate column of microtiles).
+ for ( dim_t ii = 0; ii < ir_st; ++ii )
+ {
+ const doff_t diagoffa_ii = diagoffa + ( doff_t )ii*MR;
+
+ if ( bli_intersects_diag_n( diagoffa_ii, MR, k ) )
+ {
+ // Determine the length of the panel that was packed.
+ const dim_t k_a1112 = k - diagoffa_ii;
+
+ // Compute the panel stride for the current diagonal-
+ // intersecting micro-panel.
+ inc_t ps_a_cur = k_a1112 * PACKMR;
+ ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 );
+ ps_a_cur *= dt_size;
+
+ a1 += ps_a_cur;
+ }
+ else if ( bli_is_strictly_above_diag_n( diagoffa_ii, MR, k ) )
+ {
+ a1 += rstep_a;
+ }
+ }
+
+ // Loop over the n dimension (NR columns at a time).
+ for ( ; true; ++j )
+ {
+ const char* b1 = b_cast + j * cstep_b;
+ char* c1 = c_cast + j * cstep_c;
+
+ const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left )
+ ? NR : n_left );
+
+ // Initialize our next panel of B to be the current panel of B.
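The jr_st/ir_st offsets consumed above come from bli_thread_range_tlb_trmm_lu(), which balances work by accounting for the reduced k extent of diagonal-intersecting micropanels. As a rough illustration of the underlying idea only, the sketch below (hypothetical helper range_tlb_sketch(); not the function used by this patch) shows how a flat microtile index maps to (jr, ir) starting coordinates in the simpler unweighted, gemm-like case:

    /* Sketch: unweighted tile-level load balancing over an m_iter x n_iter
       grid of microtiles. Microtiles are numbered down each column of C,
       column by column, matching the unified jr+ir traversal above. Types
       such as dim_t come from blis.h. */
    static void range_tlb_sketch
         (
           dim_t  nt,      /* number of threads sharing the jr+ir loops */
           dim_t  tid,     /* this thread's id                          */
           dim_t  m_iter,  /* microtiles per column                     */
           dim_t  n_iter,  /* columns of microtiles                     */
           dim_t* jr_st,   /* out: starting column                      */
           dim_t* ir_st,   /* out: starting microtile within the column */
           dim_t* n_ut     /* out: number of microtiles for this thread */
         )
    {
        const dim_t n_ut_all = m_iter * n_iter;  /* total microtiles        */
        const dim_t n_ut_per = n_ut_all / nt;    /* baseline per thread     */
        const dim_t n_extra  = n_ut_all % nt;    /* threads with one extra  */

        /* Threads [0, n_extra) receive one additional microtile each. */
        *n_ut = n_ut_per + ( tid < n_extra ? 1 : 0 );

        /* Flat index of this thread's first microtile. */
        const dim_t ut0 = tid * n_ut_per + ( tid < n_extra ? tid : n_extra );

        /* Convert the flat index into (jr, ir) coordinates. */
        *jr_st = ut0 / m_iter;
        *ir_st = ut0 % m_iter;
    }

A thread whose share works out to zero microtiles simply returns, which corresponds to the n_ut_for_me == 0 early return above.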
+ const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + char* c11 = c1 + i * rstep_c; + + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // If the current panel of A intersects the diagonal, scale C + // by beta. If it is strictly above the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + // Determine the offset to and length of the panel that was + // packed so we can index into the corresponding location in B. + const dim_t off_a1112 = diagoffa_i; + const dim_t k_a1112 = k - off_a1112; + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_a_cur = k_a1112 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + const char* b1_i = b1 + off_a1112 * PACKNR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, ps_a_cur, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_a1112, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1_i, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) + { + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + + a1 += rstep_a; + } + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + // Reset the a1 pointer to the beginning of the packed matrix A. 
+ a1 = a_cast; + } +} + +//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,printm)( "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" ); + +#if 0 + printf( "tid: %ld m,n,k_iter: %ld %ld %ld\n", tid, m_iter, n_iter, k_iter ); + printf( "tid: %ld trmm_lu_tlb begins at: %ld %ld (n_ut: %ld)\n", + tid, jr_st, ir_st, n_ut_for_me ); +#endif + diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 785f2cf5fd..f8d0fc6c85 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -37,11 +37,11 @@ void bli_trmm_rl_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -147,39 +147,40 @@ void bli_trmm_rl_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); + // Query the number of threads and thread ids for each loop. + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; // Note that we partition the 2nd loop into two regions: the rectangular // part of B, and the triangular portion. @@ -207,11 +208,11 @@ void bli_trmm_rl_ker_var2 // Determine the thread range and increment for the 2nd and 1st loops for // the initial rectangular region of B (if it exists). - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is disabled for now. 
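For the var2 kernels retained here, bli_thread_range_slrr() resolves at configure time to either slab or round-robin partitioning of the jr iterations. A minimal sketch of what those two range shapes typically look like (hypothetical helpers range_slab_sketch()/range_rr_sketch(); not the library's implementation):

    /* Sketch: two classic ways of carving n_iter loop iterations among nt
       threads. Slab gives each thread one contiguous block walked with
       increment 1; round-robin starts each thread at its own id and strides
       by nt. Types and bli_min() come from blis.h. */
    static void range_slab_sketch
         ( dim_t tid, dim_t nt, dim_t n_iter,
           dim_t* start, dim_t* end, dim_t* inc )
    {
        const dim_t len = n_iter / nt + ( n_iter % nt ? 1 : 0 );
        *start = bli_min( tid * len, n_iter );
        *end   = bli_min( *start + len, n_iter );
        *inc   = 1;
    }

    static void range_rr_sketch
         ( dim_t tid, dim_t nt, dim_t n_iter,
           dim_t* start, dim_t* end, dim_t* inc )
    {
        *start = tid;     /* iterations tid, tid + nt, tid + 2*nt, ... */
        *end   = n_iter;
        *inc   = nt;
    }

Either way, each thread simply runs the jr loop from *start to *end in steps of *inc, which is exactly how jr_start/jr_end/jr_inc are consumed by the loops that follow.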
- bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -219,7 +220,7 @@ void bli_trmm_rl_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -231,15 +232,15 @@ void bli_trmm_rl_ker_var2 const char* a1 = a_cast + i * rstep_a; char* c11 = c1 + i * rstep_c; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); // Compute the addresses of the next panels of A and B. const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + if ( bli_is_last_iter_slrr( i, m_iter, ir_tid, ir_nt ) ) { a2 = a_cast; b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } @@ -271,7 +272,7 @@ void bli_trmm_rl_ker_var2 // Use round-robin assignment of micropanels to threads in the 2nd and // 1st loops for the remaining triangular region of B (if it exists). - // NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + // NOTE: We don't need to call bli_thread_range_rr() here since we // employ a hack that calls for each thread to execute every iteration // of the jr and ir loops but skip all but the pointer increment for // iterations that are not assigned to it. @@ -285,18 +286,18 @@ void bli_trmm_rl_ker_var2 // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < n_iter; ++j ) { - doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; // Determine the offset to the beginning of the panel that // was packed so we can index into the corresponding location // in A. Then compute the length of that panel. - dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); - dim_t k_b1121 = k - off_b1121; + const dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + const dim_t k_b1121 = k - off_b1121; const char* a1 = a_cast; char* c11 = c1; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -319,7 +320,7 @@ void bli_trmm_rl_ker_var2 { if ( bli_trmm_my_iter_rr( i, caucus ) ) { - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2b.c b/frame/3/trmm/bli_trmm_rl_ker_var2b.c new file mode 100644 index 0000000000..7f2757c3af --- /dev/null +++ b/frame/3/trmm/bli_trmm_rl_ker_var2b.c @@ -0,0 +1,392 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_rl_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. 
+ gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely above the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. (Note we assume the diagonal offset + // is a multiple of NR; this assumption will hold as long as the cache + // blocksizes KC and NC are each a multiple of NR.) + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Computing the number of NR x NR tiles in the k dimension is needed + // when computing the thread ranges below. + const dim_t k_iter = k / NR + ( k % NR ? 1 : 0 ); + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel while the 'caucus' points to the thrinfo_t + // node for the 1st loop (ir). + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. 
+#if 0 +{ + const dim_t jr_nt = 17; + const dim_t jr_tid = jr_nt - 1; + + const doff_t m_iter = 10; + const doff_t k_iter = 10; + const doff_t n_iter = 20; + + diagoffb = 30 * NR; +#else + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); +#endif + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_trmm_rl( jr_nt, jr_tid, diagoffb, m_iter, n_iter, k_iter, + MR, NR, &jr_st, &ir_st ); + +#if 0 + printf( "tid %ld: final range: jr_st, ir_st: %ld %ld (n_ut_for_me: %ld)\n", + jr_tid, jr_st, ir_st, n_ut_for_me ); + return; +} +const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st; +#endif + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb_trmm_r(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + const char* b1 = b_cast; + + // Get pointers into position by stepping through to the jth micropanel of + // B and jth microtile of C (within the appropriate row of microtiles). + for ( dim_t jj = 0; jj < jr_st; ++jj ) + { + const doff_t diagoffb_jj = diagoffb - ( doff_t )jj*NR; + + if ( bli_intersects_diag_n( diagoffb_jj, k, NR ) ) + { + // Determine the length of the panel that was packed. + const dim_t off_b1121 = bli_max( -diagoffb_jj, 0 ); + const dim_t k_b1121 = k - off_b1121; + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffb_jj, k, NR ) ) + { + b1 += cstep_b; + } + } + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + char* c1 = c_cast + j * cstep_c; + + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Determine the offset to and length of the panel that was packed + // so we can index into the corresponding location in A. + const dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + const dim_t k_b1121 = k - off_b1121; + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. 
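The pointer-positioning loop near the top of this kernel advances b1 past each already-assigned micropanel of B using a stride that reflects how much of a diagonal-intersecting panel was actually packed, padded to an even element count. A small worked example of that computation (hypothetical helper panel_stride_sketch() with made-up values; double precision, so dt_size = 8):

    /* Sketch: byte stride of one packed micropanel whose stored length is
       k_panel rows of PACKNR elements, padded so the element count is even
       (the same bli_is_odd() bump used above). */
    static inc_t panel_stride_sketch( dim_t k_panel, dim_t PACKNR, dim_t dt_size )
    {
        inc_t ps = k_panel * PACKNR;         /* e.g. 13 * 6 = 78 elements   */
        ps += ( bli_is_odd( ps ) ? 1 : 0 );  /* 78 is even; odd counts get
                                                bumped up by one element    */
        return ps * dt_size;                 /* 78 * 8 = 624 bytes          */
    }

Full (rectangular) micropanels skip this computation entirely and advance by the fixed cstep_b, as in the strictly-below-diagonal branch.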
+ const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, ps_b_cur, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_b1121, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) + { + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + b1 += cstep_b; + } + } +} + +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" ); + diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index ca27caef10..a031b67947 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -37,11 +37,11 @@ void bli_trmm_ru_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -148,25 +148,23 @@ void bli_trmm_ru_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. 
- inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); @@ -177,14 +175,13 @@ void bli_trmm_ru_ker_var2 thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t ir_nt = bli_thrinfo_n_way( caucus ); - dim_t ir_tid = bli_thrinfo_work_id( caucus ); + //const dim_t jr_nt = bli_thrinfo_n_way( thread ); + //const dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + const dim_t ir_tid = bli_thrinfo_work_id( caucus ); - dim_t jr_start, jr_end; - dim_t ir_start, ir_end; - dim_t jr_inc, ir_inc; + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; // Note that we partition the 2nd loop into two regions: the triangular // part of C, and the rectangular portion. @@ -212,7 +209,7 @@ void bli_trmm_ru_ker_var2 // Use round-robin assignment of micropanels to threads in the 2nd and // 1st loops for the initial triangular region of B (if it exists). - // NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + // NOTE: We don't need to call bli_thread_range_rr() here since we // employ a hack that calls for each thread to execute every iteration // of the jr and ir loops but skip all but the pointer increment for // iterations that are not assigned to it. @@ -223,17 +220,18 @@ void bli_trmm_ru_ker_var2 // Loop over the n dimension (NR columns at a time). for ( dim_t j = 0; j < n_iter_tri; ++j ) { - doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; // Determine the offset to and length of the panel that was packed // so we can index into the corresponding location in A. - dim_t off_b0111 = 0; - dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); + const dim_t off_b0111 = 0; + const dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); const char* a1 = a_cast; char* c11 = c1; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -256,7 +254,8 @@ void bli_trmm_ru_ker_var2 { if ( bli_trmm_my_iter_rr( i, caucus ) ) { - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); const char* a1_i = a1 + off_b0111 * PACKMR * dt_size; @@ -266,8 +265,6 @@ void bli_trmm_ru_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t @@ -307,11 +304,11 @@ void bli_trmm_ru_ker_var2 // Determine the thread range and increment for the 2nd and 1st loops for // the remaining rectangular region of B. 
- // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is disabled for now. - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + bli_thread_range_slrr( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); // Advance the start and end iteration offsets for the rectangular region // by the number of iterations used for the triangular region. @@ -332,7 +329,8 @@ void bli_trmm_ru_ker_var2 b1 = b_cast + (j-jb0) * cstep_b; c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -348,16 +346,15 @@ void bli_trmm_ru_ker_var2 const char* a1 = a_cast + i * rstep_a; char* c11 = c1 + i * rstep_c; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // Compute the addresses of the next panels of A and B. const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + if ( bli_is_last_iter_slrr( i, m_iter, ir_tid, ir_nt ) ) { a2 = a_cast; b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) - b2 = b_cast; } // Save addresses of next panels of A and B to the auxinfo_t diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2b.c b/frame/3/trmm/bli_trmm_ru_ker_var2b.c new file mode 100644 index 0000000000..8aae2386aa --- /dev/null +++ b/frame/3/trmm/bli_trmm_ru_ker_var2b.c @@ -0,0 +1,390 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_ru_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely below its diagonal, + // it is implicitly zero. So we do nothing. 
+ if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region to the left of where the diagonal of B + // intersects the top edge of the panel, adjust the pointer to C and + // treat this case as if the diagonal offset were zero. This skips over + // the region that was not packed. (Note we assume the diagonal offset + // is a multiple of NR; this assumption will hold as long as the cache + // blocksizes KC and NC are each a multiple of NR.) + if ( diagoffb > 0 ) + { + n -= diagoffb; + c_cast += diagoffb * cs_c * dt_size; + diagoffb = 0; + } + + // If there is a zero region below where the diagonal of B intersects the + // right side of the block, shrink it to prevent "no-op" iterations from + // executing. + if ( -diagoffb + n < k ) + { + k = -diagoffb + n; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Computing the number of NR x NR tiles in the k dimension is needed + // when computing the thread ranges below. + const dim_t k_iter = k / NR + ( k % NR ? 1 : 0 ); + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel. Here we query the thrinfo_t node for the + // 1st (ir) loop around the microkernel. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. +#if 0 +{ + const dim_t jr_nt = 1; + const dim_t jr_tid = 0; //jr_nt - 1; + + const doff_t m_iter = 10; + const doff_t k_iter = 10; + const doff_t n_iter = 20; + + diagoffb = 0 * NR; +#else + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); +#endif + dim_t jr_st, ir_st; + const dim_t n_ut_for_me + = + bli_thread_range_tlb_trmm_ru( jr_nt, jr_tid, diagoffb, m_iter, n_iter, k_iter, + MR, NR, &jr_st, &ir_st ); + +#if 0 + printf( "tid %ld: final range: jr_st, ir_st: %ld %ld (n_ut_for_me: %ld)\n", + jr_tid, jr_st, ir_st, n_ut_for_me ); + return; +} +const dim_t n_ut_for_me = -1; dim_t jr_st, ir_st; +#endif + + // It's possible that there are so few microtiles relative to the number + // of threads that one or more threads gets no work. If that happens, those + // threads can return early. + if ( n_ut_for_me == 0 ) return; + + // Start the jr/ir loops with the current thread's microtile offsets computed + // by bli_thread_range_tlb_trmm_r(). + dim_t i = ir_st; + dim_t j = jr_st; + + // Initialize a counter to track the number of microtiles computed by the + // current thread. + dim_t ut = 0; + + const char* b1 = b_cast; + + // Get pointers into position by stepping through to the jth micropanel of + // B and jth microtile of C (within the appropriate row of microtiles). 
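// Aside: a minimal sketch (guarded out, not part of the patch) of what a
// tile-level load-balanced range could look like for a purely rectangular
// region, assuming every microtile costs the same. The helper name
// range_tlb_rect() and its signature are hypothetical; the actual
// bli_thread_range_tlb_trmm_ru() used above additionally weights the columns
// that intersect the diagonal by their packed length (hence its k_iter
// argument).
#if 0
static dim_t range_tlb_rect
     (
       dim_t nt, dim_t tid, dim_t m_iter, dim_t n_iter,
       dim_t* jr_st, dim_t* ir_st
     )
{
	// Number all m_iter * n_iter microtiles in column-major order and hand
	// each thread a contiguous run of (nearly) equal length.
	const dim_t n_ut_total  = m_iter * n_iter;
	const dim_t n_ut_base   = n_ut_total / nt;
	const dim_t n_ut_extra  = n_ut_total % nt;

	// Threads with tid < n_ut_extra receive one extra microtile.
	const dim_t n_ut_for_me = n_ut_base + ( tid < n_ut_extra ? 1 : 0 );
	const dim_t ut_begin    = tid * n_ut_base + bli_min( tid, n_ut_extra );

	// Convert the linear starting offset back into (jr, ir) loop indices.
	*jr_st = ut_begin / m_iter;
	*ir_st = ut_begin % m_iter;

	return n_ut_for_me;
}
#endif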
+ for ( dim_t jj = 0; jj < jr_st; ++jj ) + { + const doff_t diagoffb_jj = diagoffb - ( doff_t )jj*NR; + + if ( bli_intersects_diag_n( diagoffb_jj, k, NR ) ) + { + // Determine the length of the panel that was packed. + const dim_t k_b0111 = bli_min( k, -diagoffb_jj + NR ); + + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b0111 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffb_jj, k, NR ) ) + { + b1 += cstep_b; + } + } + + // Loop over the n dimension (NR columns at a time). + for ( ; true; ++j ) + { + char* c1 = c_cast + j * cstep_c; + + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Determine the offset to and length of the panel that was packed + // so we can index into the corresponding location in A. + const dim_t off_b0111 = 0; + const dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + bli_auxinfo_set_next_b( b2, &aux ); + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly above the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b0111 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + const char* a1_i = a1 + off_b0111 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, ps_b_cur, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_b0111, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) + { + // Loop over the m dimension (MR rows at a time). + for ( ; i < m_iter; ++i ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter_sl( i, m_iter ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, 1 ); + bli_auxinfo_set_next_b( b2, &aux ); + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + // Increment the microtile counter and check if the thread is done. + ut += 1; if ( ut == n_ut_for_me ) return; + } + + // Upon reaching the end of the column of microtiles, reset the ir + // loop index so that we're ready to start the next pass through the + // m dimension (i.e., the next jr loop iteration). + i = 0; + + b1 += cstep_b; + } + } +} + +//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" ); + diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index f8c3d7ee20..0a605ba86a 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -43,54 +43,23 @@ \ void PASTEMAC0(opname) \ ( \ - const obj_t* a, \ - const obj_t* b, \ - const obj_t* c, \ - const cntx_t* cntx, \ - const cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const cntl_t* cntl, \ + thrinfo_t* thread_par \ ); -//GENPROT( trmm_blk_var1 ) -//GENPROT( trmm_blk_var2 ) -//GENPROT( trmm_blk_var3 ) - GENPROT( trmm_xx_ker_var2 ) - GENPROT( trmm_ll_ker_var2 ) GENPROT( trmm_lu_ker_var2 ) GENPROT( trmm_rl_ker_var2 ) GENPROT( trmm_ru_ker_var2 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. 
-// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoff, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( trmm_ll_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_lu_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_rl_ker_var2 ) -INSERT_GENTPROT_BASIC0( trmm_ru_ker_var2 ) +GENPROT( trmm_xx_ker_var2b ) +GENPROT( trmm_ll_ker_var2b ) +GENPROT( trmm_lu_ker_var2b ) +GENPROT( trmm_rl_ker_var2b ) +GENPROT( trmm_ru_ker_var2b ) diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index 60030bf4aa..918b8f973e 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -43,12 +43,12 @@ static l3_var_oft vars[2][2] = void bli_trmm_xx_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { dim_t side; @@ -81,7 +81,7 @@ void bli_trmm_xx_ker_var2 c, cntx, cntl, - thread + thread_par ); } diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2b.c b/frame/3/trmm/bli_trmm_xx_ker_var2b.c new file mode 100644 index 0000000000..57894165ce --- /dev/null +++ b/frame/3/trmm/bli_trmm_xx_ker_var2b.c @@ -0,0 +1,87 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +static l3_var_oft vars[2][2] = +{ + { bli_trmm_ll_ker_var2b, bli_trmm_lu_ker_var2b }, + { bli_trmm_rl_ker_var2b, bli_trmm_ru_ker_var2b } +}; + +void bli_trmm_xx_ker_var2b + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + dim_t side; + dim_t uplo; + + // Set two bools: one based on the implied side parameter (the structure + // of the root object) and one based on the uplo field of the triangular + // matrix's root object (whether that is matrix A or matrix B). + if ( bli_obj_root_is_triangular( a ) ) + { + side = 0; + if ( bli_obj_root_is_lower( a ) ) uplo = 0; + else uplo = 1; + } + else // if ( bli_obj_root_is_triangular( b ) ) + { + side = 1; + if ( bli_obj_root_is_lower( b ) ) uplo = 0; + else uplo = 1; + } + + // Index into the variant array to extract the correct function pointer. + l3_var_oft f = vars[side][uplo]; + + // Call the macrokernel. + f + ( + a, + b, + c, + cntx, + cntl, + thread_par + ); +} + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev new file mode 100644 index 0000000000..5aebe23c1c --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.prev @@ -0,0 +1,371 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +void bli_trmm_rl_ker_var2 + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely above the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. 
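	// (For example, with diagoffb = 0, k = 16, and n = 24, columns 16..23 of
	// the lower-triangular panel of B contain only zeros, so n is shrunk to
	// diagoffb + k = 16 and the corresponding no-op iterations are skipped.)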
+ if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + // Determine some increments used to step through A, B, and C. + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + // Save the pack schemas of A and B to the auxinfo_t object. + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + dim_t jr_nt = bli_thrinfo_n_way( thread ); + dim_t jr_tid = bli_thrinfo_work_id( thread ); + dim_t ir_nt = bli_thrinfo_n_way( caucus ); + dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + + // Note that we partition the 2nd loop into two regions: the rectangular + // part of B, and the triangular portion. + dim_t n_iter_rct; + dim_t n_iter_tri; + + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) + { + // If the entire panel of B does not intersect the diagonal, there is + // no triangular region, and therefore we can skip the second set of + // loops. + n_iter_rct = n_iter; + n_iter_tri = 0; + } + else + { + // If the panel of B does intersect the diagonal, compute the number of + // iterations in the rectangular region by dividing NR into the diagonal + // offset. (There should never be any remainder in this division.) The + // number of iterations in the triangular (or trapezoidal) region is + // computed as the remaining number of iterations in the n dimension. + n_iter_rct = diagoffb / NR; + n_iter_tri = n_iter - n_iter_rct; + } + + // Determine the thread range and increment for the 2nd and 1st loops for + // the initial rectangular region of B (if it exists). + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is disabled for now. + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + { + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. 
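	// (These "next panel" addresses allow microkernels that prefetch to begin
	// loading the upcoming micropanels of A and B while the current rank-k
	// update is still in progress.)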
+ bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + // If there is no triangular region, then we're done. + if ( n_iter_tri == 0 ) return; + + // Use round-robin assignment of micropanels to threads in the 2nd and + // 1st loops for the remaining triangular region of B (if it exists). + // NOTE: We don't need to call bli_thread_range_jrir_rr() here since we + // employ a hack that calls for each thread to execute every iteration + // of the jr and ir loops but skip all but the pointer increment for + // iterations that are not assigned to it. + + // Advance the starting b1 and c1 pointers to the positions corresponding + // to the start of the triangular region of B. + jr_start = n_iter_rct; + const char* b1 = b_cast + jr_start * cstep_b; + char* c1 = c_cast + jr_start * cstep_c; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < n_iter; ++j ) + { + doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + // Determine the offset to the beginning of the panel that + // was packed so we can index into the corresponding location + // in A. Then compute the length of that panel. + dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + dim_t k_b1121 = k - off_b1121; + + const char* a1 = a_cast; + char* c11 = c1; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + if ( bli_trmm_my_iter_rr( j, thread ) ) { + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trmm_my_iter_rr( i, caucus ) ) { + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. 
+ gemm_ukr + ( + m_cur, + n_cur, + k_b1121, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + + a1 += rstep_a; + c11 += rstep_c; + } + } + + b1 += ps_b_cur; + } + + c1 += cstep_c; + } +} + +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" ); + diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified new file mode 100644 index 0000000000..7d2aabaa4b --- /dev/null +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c.unified @@ -0,0 +1,324 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_trmm_rl_ker_var2 + ( + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par + ) +{ + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); + + doff_t diagoffb = bli_obj_diag_offset( b ); + + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); + + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); + + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); + + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); + + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); + + // Detach and multiply the scalars attached to A and B. 
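	// (Multiplying the detached scalars merges alpha from A and alpha from B
	// into one value, so the microkernel below only needs a single alpha
	// operand.)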
+ obj_t scalar_a, scalar_b; + bli_obj_scalar_detach( a, &scalar_a ); + bli_obj_scalar_detach( b, &scalar_b ); + bli_mulsc( &scalar_a, &scalar_b ); + + // Grab the addresses of the internal scalar buffers for the scalar + // merged above and the scalar attached to C. + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); + + // Alias some constants to simpler names. + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + + // Query the context for the micro-kernel address and cast it to its + // function pointer type. + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + + /* + Assumptions/assertions: + rs_a == 1 + cs_a == PACKMR + pd_a == MR + ps_a == stride to next micro-panel of A + rs_b == PACKNR + cs_b == 1 + pd_b == NR + ps_b == stride to next micro-panel of B + rs_c == (no assumptions) + cs_c == (no assumptions) + */ + + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + // If any dimension is zero, return immediately. + if ( bli_zero_dim3( m, n, k ) ) return; + + // Safeguard: If the current panel of B is entirely above the diagonal, + // it is implicitly zero. So we do nothing. + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it to prevent + // "no-op" iterations from executing. + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + + // Compute number of primary and leftover components of the m and n + // dimensions. + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; + + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; + + // Determine some increments used to step through A, B, and C. + const inc_t rstep_a = ps_a * dt_size; + + const inc_t cstep_b = ps_b * dt_size; + + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; + + auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + // The 'thread' argument points to the thrinfo_t node for the 2nd (jr) + // loop around the microkernel while the 'caucus' points to the thrinfo_t + // node for the 1st loop (ir). + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + // Query the number of threads and thread ids for each loop. 
+ //const dim_t jr_nt = bli_thrinfo_n_way( thread ); + //const dim_t jr_tid = bli_thrinfo_work_id( thread ); + //const dim_t ir_nt = bli_thrinfo_n_way( caucus ); + //const dim_t ir_tid = bli_thrinfo_work_id( caucus ); + + dim_t jr_start, jr_end, jr_inc; + dim_t ir_start, ir_end, ir_inc; + + // Determine the thread range and increment for the 2nd and 1st loops. + // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // slab or round-robin partitioning was requested at configure-time. + // NOTE: Parallelism in the 1st loop is disabled for now. + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + const char* b1 = b_cast; + // char* c1 = c_cast; + + // Loop over the n dimension (NR columns at a time). + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* a1 = a_cast; + char* c1 = c_cast + j * cstep_c; + char* c11 = c1; + + const doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); + + // Determine the offset to the beginning of the panel that + // was packed so we can index into the corresponding location + // in A. Then compute the length of that panel. + const dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + const dim_t k_b1121 = k - off_b1121; + + // Initialize our next panel of B to be the current panel of B. + const char* b2 = b1; + + // If the current panel of B intersects the diagonal, scale C + // by beta. If it is strictly below the diagonal, scale by one. + // This allows the current macro-kernel to work for both trmm + // and trmm3. + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { + // Compute the panel stride for the current diagonal- + // intersecting micro-panel. + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + //for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + const char* a1_i = a1 + off_b1121 * PACKMR * dt_size; + + // Compute the addresses of the next panels of A and B. + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k_b1121, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) + { + // Loop over the m dimension (MR rows at a time). + for ( dim_t i = 0; i < m_iter; ++i ) + { + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); + + // Compute the addresses of the next panels of A and B. 
+ const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, 1 ); + if ( bli_is_last_iter( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + //if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + // b2 = b_cast; + } + + // Save addresses of next panels of A and B to the auxinfo_t + // object. + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + // Invoke the gemm micro-kernel. + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += cstep_b; + } + + //c1 += cstep_c; + } +} + +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" ); +//PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" ); + diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c index 275d6ca470..45af769104 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -356,7 +356,7 @@ void PASTEMAC(ch,varname) \ b2 = b1; \ \ /* If the current panel of B intersects the diagonal, scale C - by beta. If it is strictly below the diagonal, scale by one. + by beta. If it is strictly above the diagonal, scale by one. This allows the current macro-kernel to work for both trmm and trmm3. */ \ if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index e2128f1009..786e4f343f 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -37,11 +37,11 @@ void bli_trsm_ll_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -158,47 +158,44 @@ void bli_trsm_ll_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // We don't bother querying the thrinfo_t node for the 1st loop because // we can't parallelize that loop in trsm due to the inter-iteration // dependencies that exist. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. 
- thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t jr_start, jr_end; - dim_t jr_inc; + dim_t jr_start, jr_end, jr_inc; // Determine the thread range and increment for the 2nd loop. - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is unattainable due to the // inter-iteration dependencies present in trsm. - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -206,7 +203,8 @@ void bli_trsm_ll_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -217,9 +215,10 @@ void bli_trsm_ll_ker_var2 // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { - doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) + ? MR : m_left ); // If the current panel of A intersects the diagonal, use a // special micro-kernel that performs a fused gemm and trsm. @@ -230,10 +229,10 @@ void bli_trsm_ll_ker_var2 if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) { // Compute various offsets into and lengths of parts of A. - dim_t off_a10 = 0; - dim_t k_a1011 = diagoffa_i + MR; - dim_t k_a10 = k_a1011 - MR; - dim_t off_a11 = k_a10; + const dim_t off_a10 = 0; + const dim_t k_a1011 = diagoffa_i + MR; + const dim_t k_a10 = k_a1011 - MR; + const dim_t off_a11 = k_a10; // Compute the panel stride for the current diagonal- // intersecting micro-panel. @@ -258,7 +257,7 @@ void bli_trsm_ll_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } @@ -292,7 +291,7 @@ void bli_trsm_ll_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 314ee30706..ebf44905b4 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -37,11 +37,11 @@ void bli_trsm_lu_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -169,47 +169,44 @@ void bli_trsm_lu_ker_var2 // Compute number of primary and leftover components of the m and n // dimensions. - dim_t n_iter = n / NR; - dim_t n_left = n % NR; + const dim_t n_iter = n / NR + ( n % NR ? 
1 : 0 ); + const dim_t n_left = n % NR; - dim_t m_iter = m / MR; - dim_t m_left = m % MR; - - if ( n_left ) ++n_iter; - if ( m_left ) ++m_iter; + const dim_t m_iter = m / MR + ( m % MR ? 1 : 0 ); + const dim_t m_left = m % MR; // Determine some increments used to step through A, B, and C. - inc_t rstep_a = ps_a * dt_size; + const inc_t rstep_a = ps_a * dt_size; - inc_t cstep_b = ps_b * dt_size; + const inc_t cstep_b = ps_b * dt_size; - inc_t rstep_c = rs_c * MR * dt_size; - inc_t cstep_c = cs_c * NR * dt_size; + const inc_t rstep_c = rs_c * MR * dt_size; + const inc_t cstep_c = cs_c * NR * dt_size; - // Save the pack schemas of A and B to the auxinfo_t object. auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. bli_auxinfo_set_schema_a( schema_a, &aux ); bli_auxinfo_set_schema_b( schema_b, &aux ); // We don't bother querying the thrinfo_t node for the 1st loop because // we can't parallelize that loop in trsm due to the inter-iteration // dependencies that exist. + thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); //thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); // Query the number of threads and thread ids for each loop. - thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - dim_t jr_nt = bli_thrinfo_n_way( thread ); - dim_t jr_tid = bli_thrinfo_work_id( thread ); + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t jr_tid = bli_thrinfo_work_id( thread ); - dim_t jr_start, jr_end; - dim_t jr_inc; + dim_t jr_start, jr_end, jr_inc; // Determine the thread range and increment for the 2nd loop. - // NOTE: The definition of bli_thread_range_jrir() will depend on whether + // NOTE: The definition of bli_thread_range_slrr() will depend on whether // slab or round-robin partitioning was requested at configure-time. // NOTE: Parallelism in the 1st loop is unattainable due to the // inter-iteration dependencies present in trsm. - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) @@ -217,7 +214,8 @@ void bli_trsm_lu_ker_var2 const char* b1 = b_cast + j * cstep_b; char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) + ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; @@ -228,10 +226,11 @@ void bli_trsm_lu_ker_var2 // Loop over the m dimension (MR rows at a time). for ( dim_t ib = 0; ib < m_iter; ++ib ) { - dim_t i = m_iter - 1 - ib; - doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + const dim_t i = m_iter - 1 - ib; + const doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; - dim_t m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) + ? MR : m_left ); // If the current panel of A intersects the diagonal, use a // special micro-kernel that performs a fused gemm and trsm. @@ -242,11 +241,11 @@ void bli_trsm_lu_ker_var2 if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) { // Compute various offsets into and lengths of parts of A. 
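	// (For example, with diagoffa_i = 8, MR = 8, and k = 32: off_a11 = 8,
	// k_a1112 = 24, k_a11 = 8, k_a12 = 16, and off_a12 = 16; that is, the
	// MR x MR triangular block A11 starts at column 8 and the panel A12
	// spans the 16 columns to its right.)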
- dim_t off_a11 = diagoffa_i; - dim_t k_a1112 = k - off_a11;; - dim_t k_a11 = MR; - dim_t k_a12 = k_a1112 - MR; - dim_t off_a12 = off_a11 + k_a11; + const dim_t off_a11 = diagoffa_i; + const dim_t k_a1112 = k - off_a11;; + const dim_t k_a11 = MR; + const dim_t k_a12 = k_a1112 - MR; + const dim_t off_a12 = off_a11 + k_a11; // Compute the panel stride for the current diagonal- // intersecting micro-panel. @@ -271,7 +270,7 @@ void bli_trsm_lu_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } @@ -305,7 +304,7 @@ void bli_trsm_lu_ker_var2 { a2 = a_cast; b2 = b1; - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + if ( bli_is_last_iter_slrr( j, n_iter, jr_tid, jr_nt ) ) b2 = b_cast; } diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 42e72840ef..073fe3ec07 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -37,11 +37,11 @@ void bli_trsm_rl_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -131,23 +131,23 @@ void bli_trsm_rl_ker_var2 the right-hand side parameter case). */ - /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ + // Safety trap: Certain indexing within this macro-kernel does not + // work as intended if both MR and NR are odd. if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); - /* If any dimension is zero, return immediately. */ + // If any dimension is zero, return immediately. if ( bli_zero_dim3( m, n, k ) ) return; - /* Safeguard: If the current panel of B is entirely above its diagonal, - it is implicitly zero. So we do nothing. */ + // Safeguard: If the current panel of B is entirely above its diagonal, + // it is implicitly zero. So we do nothing. if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; - /* If there is a zero region above where the diagonal of B intersects - the left edge of the panel, adjust the pointer to A and treat this - case as if the diagonal offset were zero. Note that we don't need to - adjust the pointer to B since packm would have simply skipped over - the region that was not stored. */ + // If there is a zero region above where the diagonal of B intersects + // the left edge of the panel, adjust the pointer to A and treat this + // case as if the diagonal offset were zero. Note that we don't need to + // adjust the pointer to B since packm would have simply skipped over + // the region that was not stored. if ( diagoffb < 0 ) { k += diagoffb; @@ -155,40 +155,40 @@ void bli_trsm_rl_ker_var2 diagoffb = 0; } - /* If there is a zero region to the right of where the diagonal - of B intersects the bottom of the panel, shrink it so that - we can index to the correct place in C (corresponding to the - part of the panel of B that was packed). - NOTE: This is NOT being done to skip over "no-op" iterations, - as with the trsm_lu macro-kernel. This MUST be done for correct - execution because we use n (via n_iter) to compute diagonal and - index offsets for backwards movement through B. 
*/ + // If there is a zero region to the right of where the diagonal + // of B intersects the bottom of the panel, shrink it so that + // we can index to the correct place in C (corresponding to the + // part of the panel of B that was packed). + // NOTE: This is NOT being done to skip over "no-op" iterations, + // as with the trsm_lu macro-kernel. This MUST be done for correct + // execution because we use n (via n_iter) to compute diagonal and + // index offsets for backwards movement through B. if ( diagoffb + k < n ) { n = diagoffb + k; } - /* Check the k dimension, which needs to be a multiple of NR. If k - isn't a multiple of NR, we adjust it higher to satisfy the micro- - kernel, which is expecting to perform an NR x NR triangular solve. - This adjustment of k is consistent with what happened when B was - packed: all of its bottom/right edges were zero-padded, and - furthermore, the panel that stores the bottom-right corner of the - matrix has its diagonal extended into the zero-padded region (as - identity). This allows the trsm of that bottom-right panel to - proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of A. */ + // Check the k dimension, which needs to be a multiple of NR. If k + // isn't a multiple of NR, we adjust it higher to satisfy the micro- + // kernel, which is expecting to perform an NR x NR triangular solve. + // This adjustment of k is consistent with what happened when B was + // packed: all of its bottom/right edges were zero-padded, and + // furthermore, the panel that stores the bottom-right corner of the + // matrix has its diagonal extended into the zero-padded region (as + // identity). This allows the trsm of that bottom-right panel to + // proceed without producing any infs or NaNs that would infect the + // "good" values of the corresponding block of A. if ( k % NR != 0 ) k += NR - ( k % NR ); - /* NOTE: We don't need to check that n is a multiple of PACKNR since we - know that the underlying buffer was already allocated to have an n - dimension that is a multiple of PACKNR, with the region between the - last column and the next multiple of NR zero-padded accordingly. */ + // NOTE: We don't need to check that n is a multiple of PACKNR since we + // know that the underlying buffer was already allocated to have an n + // dimension that is a multiple of PACKNR, with the region between the + // last column and the next multiple of NR zero-padded accordingly. thrinfo_t* thread = bli_thrinfo_sub_node( thread_par ); - /* Compute number of primary and leftover components of the m and n - dimensions. */ + // Compute number of primary and leftover components of the m and n + // dimensions. dim_t n_iter = n / NR; dim_t n_left = n % NR; @@ -198,7 +198,7 @@ void bli_trsm_rl_ker_var2 if ( n_left ) ++n_iter; if ( m_left ) ++m_iter; - /* Determine some increments used to step through A, B, and C. */ + // Determine some increments used to step through A, B, and C. inc_t rstep_a = ps_a * dt_size; inc_t cstep_b = ps_b * dt_size; @@ -206,17 +206,18 @@ void bli_trsm_rl_ker_var2 inc_t rstep_c = rs_c * MR * dt_size; inc_t cstep_c = cs_c * NR * dt_size; - /* Save the pack schemas of A and B to the auxinfo_t object. - NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ auxinfo_t aux; + + // Save the pack schemas of A and B to the auxinfo_t object. + // NOTE: We swap the values for A and B since the triangular + // "A" matrix is actually contained within B. 
bli_auxinfo_set_schema_a( schema_b, &aux ); bli_auxinfo_set_schema_b( schema_a, &aux ); const char* b1 = b_cast; char* c1 = c_cast; - /* Loop over the n dimension (NR columns at a time). */ + // Loop over the n dimension (NR columns at a time). for ( dim_t jb = 0; jb < n_iter; ++jb ) { dim_t j = n_iter - 1 - jb; @@ -227,50 +228,50 @@ void bli_trsm_rl_ker_var2 const char* a1 = a_cast; char* c11 = c1 + (n_iter-1)*cstep_c; - /* Initialize our next panel of B to be the current panel of B. */ + // Initialize our next panel of B to be the current panel of B. const char* b2 = b1; - /* If the current panel of B intersects the diagonal, use a - special micro-kernel that performs a fused gemm and trsm. - If the current panel of B resides below the diagonal, use a - a regular gemm micro-kernel. Otherwise, if it is above the - diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ + // If the current panel of B intersects the diagonal, use a + // special micro-kernel that performs a fused gemm and trsm. + // If the current panel of B resides below the diagonal, use a + // a regular gemm micro-kernel. Otherwise, if it is above the + // diagonal, it was not packed (because it is implicitly zero) + // and so we do nothing. if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) { - /* Determine the offset to and length of the panel that was packed - so we can index into the corresponding location in A. */ + // Determine the offset to and length of the panel that was packed + // so we can index into the corresponding location in A. dim_t off_b11 = bli_max( -diagoffb_j, 0 ); dim_t k_b1121 = k - off_b11; dim_t k_b11 = NR; dim_t k_b21 = k_b1121 - NR; dim_t off_b21 = off_b11 + k_b11; - /* Compute the addresses of the triangular block B11 and the - panel B21. */ + // Compute the addresses of the triangular block B11 and the + // panel B21. const char* b11 = b1; const char* b21 = b1 + k_b11 * PACKNR * dt_size; - /*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ + //b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 ); - /* Compute the panel stride for the current micro-panel. */ + // Compute the panel stride for the current micro-panel. inc_t ps_b_cur = k_b1121 * PACKNR; ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); ps_b_cur *= dt_size; - /* Loop over the m dimension (MR rows at a time). */ + // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { if ( bli_trsm_my_iter_rr( i, thread ) ){ dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); - /* Compute the addresses of the A11 block and A12 panel. */ + // Compute the addresses of the A11 block and A12 panel. const char* a11 = a1 + off_b11 * PACKMR * dt_size; const char* a12 = a1 + off_b21 * PACKMR * dt_size; - /* Compute the addresses of the next panels of A and B. */ + // Compute the addresses of the next panels of A and B. const char* a2 = a1; - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + //if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) { a2 = a_cast; @@ -279,9 +280,9 @@ void bli_trsm_rl_ker_var2 b2 = b_cast; } - /* Save addresses of next panels of A and B to the auxinfo_t - object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ + // Save addresses of next panels of A and B to the auxinfo_t + // object. NOTE: We swap the values for A and B since the + // triangular "A" matrix is actually contained within B. 
bli_auxinfo_set_next_a( b2, &aux ); bli_auxinfo_set_next_b( a2, &aux ); @@ -310,16 +311,16 @@ void bli_trsm_rl_ker_var2 } else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) { - /* Loop over the m dimension (MR rows at a time). */ + // Loop over the m dimension (MR rows at a time). for ( dim_t i = 0; i < m_iter; ++i ) { if ( bli_trsm_my_iter_rr( i, thread ) ){ dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); - /* Compute the addresses of the next panels of A and B. */ + // Compute the addresses of the next panels of A and B. const char* a2 = a1; - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + //if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) if ( i + bli_thrinfo_num_threads(thread) >= m_iter ) { a2 = a_cast; @@ -328,13 +329,13 @@ void bli_trsm_rl_ker_var2 b2 = b_cast; } - /* Save addresses of next panels of A and B to the auxinfo_t - object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ + // Save addresses of next panels of A and B to the auxinfo_t + // object. NOTE: We swap the values for A and B since the + // triangular "A" matrix is actually contained within B. bli_auxinfo_set_next_a( b2, &aux ); bli_auxinfo_set_next_b( a2, &aux ); - /* Invoke the gemm micro-kernel. */ + // Invoke the gemm micro-kernel. gemm_ukr ( m_cur, diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 6cc9a8bbb2..a05e944941 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -37,11 +37,11 @@ void bli_trsm_ru_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, thrinfo_t* thread_par ) { @@ -244,7 +244,7 @@ void bli_trsm_ru_ker_var2 // block B11. const char* b01 = b1; const char* b11 = b1 + k_b01 * PACKNR * dt_size; - //b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ + //b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 ); // Compute the panel stride for the current micro-panel. 
inc_t ps_b_cur = k_b0111 * PACKNR; diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index a498e687e3..4d7e72b436 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -48,7 +48,7 @@ void PASTEMAC0(opname) \ const obj_t* c, \ const cntx_t* cntx, \ const cntl_t* cntl, \ - thrinfo_t* thread \ + thrinfo_t* thread_par \ ); GENPROT( trsm_blk_var1 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index 39c5372f3e..dfeefcd9d9 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -43,12 +43,12 @@ static l3_var_oft vars[2][2] = void bli_trsm_xx_ker_var2 ( - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntx_t* cntx, - const cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + thrinfo_t* thread_par ) { dim_t side; @@ -81,7 +81,7 @@ void bli_trsm_xx_ker_var2 c, cntx, cntl, - thread + thread_par ); } diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index 1f00537d59..3fc76b978e 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -156,7 +156,7 @@ gint_t bli_info_get_enable_hpx_as_default( void ) return 0; #endif } -gint_t bli_info_get_thread_part_jrir_slab( void ) +gint_t bli_info_get_thread_jrir_slab( void ) { #ifdef BLIS_ENABLE_JRIR_SLAB return 1; @@ -164,7 +164,7 @@ gint_t bli_info_get_thread_part_jrir_slab( void ) return 0; #endif } -gint_t bli_info_get_thread_part_jrir_rr( void ) +gint_t bli_info_get_thread_jrir_rr( void ) { #ifdef BLIS_ENABLE_JRIR_RR return 1; @@ -172,6 +172,14 @@ gint_t bli_info_get_thread_part_jrir_rr( void ) return 0; #endif } +gint_t bli_info_get_thread_jrir_tlb( void ) +{ +#ifdef BLIS_ENABLE_JRIR_TLB + return 1; +#else + return 0; +#endif +} gint_t bli_info_get_enable_memkind( void ) { #ifdef BLIS_ENABLE_MEMKIND diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 08a99daea9..300b3f5843 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -74,8 +74,9 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_openmp_as_default( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_pthreads_as_default( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_hpx_as_default( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_slab( void ); -BLIS_EXPORT_BLIS gint_t bli_info_get_thread_part_jrir_rr( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_slab( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_rr( void ); +BLIS_EXPORT_BLIS gint_t bli_info_get_thread_jrir_tlb( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_memkind( void ); BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); diff --git a/frame/base/bli_prune.c b/frame/base/bli_prune.c index ebe5c23653..31c3d86d22 100644 --- a/frame/base/bli_prune.c +++ b/frame/base/bli_prune.c @@ -38,9 +38,28 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, obj_t* s, mdim_t mdim_s ) { - // If the primary object is general, it has no structure, and + // NOTE: This function is not safe to use on packed objects because it does + // not currently take into account the atomicity of the packed micropanel + // widths (i.e., the register blocksize). That is, this function will prune + // greedily, without regard to whether doing so would prune off part of a + // micropanel *which has already been packed* and "assigned" to a thread for + // inclusion in the computation. 
In order to be safe for use on packed + // matrices, this function would need to prune only up to the nearest + // micropanel edge (and to the corresponding location within the secondary + // matrix), which may not coincide exactly with the diagonal offset. + if ( bli_obj_is_packed( p ) || bli_obj_is_packed( s ) ) bli_abort(); + + // If the primary object is general AND dense, it has no structure, and // therefore, no unreferenced parts. - if ( bli_obj_is_general( p ) ) return; + // NOTE: There is at least one situation where the matrix is general but + // its uplo_t value is lower or upper: gemmt. This operation benefits from + // pruning unreferenced regions the same way herk/her2k/syrk/syr2k would. + // Because of gemmt, and any future similar operations, we limit early + // returns to situations where the primary object has a dense uplo_t value + // IN ADDITION TO general structure (rather than only checking for general + // structure). + if ( bli_obj_is_general( p ) && + bli_obj_is_dense( p ) ) return; // If the primary object is BLIS_ZEROS, set the dimensions so that the // matrix is empty. This is not strictly needed but rather a minor @@ -116,21 +135,13 @@ void bli_prune_unref_mparts( obj_t* p, mdim_t mdim_p, if ( bli_is_m_dim( mdim_p ) ) q = m; else /* if ( bli_is_n_dim( mdim_p ) ) */ q = n; - // Update the affected objects in case anything changed. Notice that - // it is okay to update the dimension and diagonal offset fields of - // packed primary objects, as long as we do so in tandem with the - // secondary object to maintain conformality. This just means that - // the "ignore-able" zero region is skipped over here, rather than - // within the macro-kernel. + // Update the affected objects' diagonal offset, dimensions, and row + // and column offsets, in case anything changed. bli_obj_set_diag_offset( diagoff_p, p ); bli_obj_set_dim( mdim_p, q, p ); bli_obj_set_dim( mdim_s, q, s ); - - // Only update the affected offset fields if the object in question - // is NOT a packed object. Otherwise, bli_obj_buffer_at_off() will - // compute the wrong address within the macro-kernel object wrapper. - if ( !bli_obj_is_packed( p ) ) { bli_obj_inc_off( mdim_p, off_inc, p ); } - if ( !bli_obj_is_packed( s ) ) { bli_obj_inc_off( mdim_s, off_inc, s ); } + bli_obj_inc_off( mdim_p, off_inc, p ); + bli_obj_inc_off( mdim_s, off_inc, s ); } } diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 786998f23d..64124c6823 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -143,12 +143,44 @@ void bli_rntm_set_ways_for_op // kind of information is already stored in the rntm_t object. bli_rntm_factorize( m, n, k, rntm ); -#if 0 -printf( "bli_rntm_set_ways_for_op()\n" ); -bli_rntm_print( rntm ); -#endif + #if 0 + printf( "bli_rntm_set_ways_for_op()\n" ); + bli_rntm_print( rntm ); + #endif // Now modify the number of ways, if necessary, based on the operation. + + // Consider gemm (hemm, symm), gemmt (herk, her2k, syrk, syr2k), and + // trmm (trmm, trmm3). + if ( +#ifdef BLIS_ENABLE_JRIR_TLB + l3_op == BLIS_GEMM || + l3_op == BLIS_GEMMT || + l3_op == BLIS_TRMM || +#endif + FALSE + ) + { + dim_t jc = bli_rntm_jc_ways( rntm ); + dim_t pc = bli_rntm_pc_ways( rntm ); + dim_t ic = bli_rntm_ic_ways( rntm ); + dim_t jr = bli_rntm_jr_ways( rntm ); + dim_t ir = bli_rntm_ir_ways( rntm ); + + // If TLB is enabled for gemm, gemmt, or trmm, redirect any ir loop parallelism + // into the jr loop.
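An illustrative aside on the remapping performed by the call that follows (the thread counts here are assumed for the example, not taken from the patch): suppose the per-loop ways arrive as jc=2, pc=1, ic=4, jr=3, ir=2, e.g. as requested through the BLIS_JC_NT, BLIS_IC_NT, BLIS_JR_NT, and BLIS_IR_NT environment variables. With tlb enabled, bli_rntm_set_ways_only() below leaves jc, pc, and ic untouched and folds jr*ir into the jr slot, yielding

    jc=2, pc=1, ic=4, jr=6, ir=1

The total thread count (2*1*4*6*1 = 48) is unchanged; the jr- and ir-level parallelism is simply expressed in the single unified jr+ir loop that tlb partitions at microtile granularity.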
+ bli_rntm_set_ways_only + ( + jc, + pc, + ic, + jr * ir, + 1, + rntm + ); + } + + // Consider trmm, trmm3, trsm. if ( l3_op == BLIS_TRMM || l3_op == BLIS_TRSM ) { diff --git a/frame/include/bli_config_macro_defs.h b/frame/include/bli_config_macro_defs.h index bf9319f4f3..9e9d47699f 100644 --- a/frame/include/bli_config_macro_defs.h +++ b/frame/include/bli_config_macro_defs.h @@ -36,6 +36,16 @@ #ifndef BLIS_CONFIG_MACRO_DEFS_H #define BLIS_CONFIG_MACRO_DEFS_H +// NOTE: This file should ONLY contain processing of macros that are set by +// configure and output into bli_config.h. Any other macro processing -- +// especially such as for those macros that are expected to be optionally +// set within a configuration's bli_family_.h header -- MUST be placed +// in bli_kernel_macro_defs.h instead. The reason: bli_arch_config.h (which +// #includes the configuration's bli_family_.h header) is #included +// much later in blis.h than this file (bli_config_macro_defs.h), and so any +// macros set in bli_family_.h would have no effect on the processing +// that happens below. + // -- INTEGER PROPERTIES ------------------------------------------------------- diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index d273c353ab..8c0f1cb143 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -151,6 +151,7 @@ #define BLIS_FREE_USER free #endif + // -- Other system-related definitions ----------------------------------------- // Size of a virtual memory page. This is used to align blocks within the @@ -245,6 +246,7 @@ #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif + // -- MR and NR blocksizes (only for reference kernels) ------------------------ // The build system defines BLIS_IN_REF_KERNEL, but only when compiling diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 1822065dab..0865b11e99 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -927,7 +927,6 @@ BLIS_INLINE stor3_t bli_stor3_transb( stor3_t id ) } - // index-related BLIS_INLINE bool bli_is_edge_f( dim_t i, dim_t n_iter, dim_t n_left ) @@ -954,7 +953,7 @@ BLIS_INLINE bool bli_is_not_edge_b( dim_t i, dim_t n_iter, dim_t n_left ) ( i != 0 || n_left == 0 ); } -BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +BLIS_INLINE bool bli_is_last_iter_sl( dim_t i, dim_t end_iter ) { return ( bool ) ( i == end_iter - 1 ); @@ -966,15 +965,59 @@ BLIS_INLINE bool bli_is_last_iter_rr( dim_t i, dim_t end_iter, dim_t tid, dim_t ( i == end_iter - 1 - ( ( end_iter - tid - 1 ) % nth ) ); } -BLIS_INLINE bool bli_is_last_iter( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +BLIS_INLINE bool bli_is_last_iter_slrr( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) { #ifdef BLIS_ENABLE_JRIR_SLAB - return bli_is_last_iter_sl( i, end_iter, tid, nth ); + return bli_is_last_iter_sl( i, end_iter ); #else // BLIS_ENABLE_JRIR_RR return bli_is_last_iter_rr( i, end_iter, tid, nth ); #endif } +BLIS_INLINE bool bli_is_last_iter_l( dim_t i, dim_t end_iter, dim_t tid, dim_t nth ) +{ + return bli_is_last_iter_slrr( i, end_iter, tid, nth ); +} + +BLIS_INLINE bool bli_is_last_iter_u( doff_t diagoff, dim_t mr, dim_t nr, inc_t inc ) +{ + return bli_is_strictly_below_diag_n( diagoff + inc*mr, mr, nr ); +} + +BLIS_INLINE bool bli_is_last_iter_tlb_l( dim_t i, dim_t end_iter ) +{ + return bli_is_last_iter_sl( i, end_iter ); +} + +BLIS_INLINE bool bli_is_last_iter_tlb_u( doff_t diagoff, 
dim_t mr, dim_t nr ) +{ + return bli_is_strictly_below_diag_n( diagoff + 1*mr, mr, nr ); +} + +BLIS_INLINE bool bli_is_my_iter_sl( dim_t i, dim_t st, dim_t en ) +{ + return ( st <= i && i < en ); +} + +BLIS_INLINE bool bli_is_my_iter_rr( dim_t i, dim_t work_id, dim_t n_way ) +{ + return ( i % n_way == work_id % n_way ); +} + +BLIS_INLINE bool bli_is_my_iter( dim_t i, dim_t st, dim_t en, dim_t work_id, dim_t n_way ) +{ + // NOTE: This function is (as of this writing) only called from packm. + // If the structure of the cpp macros below is ever changed, make sure + // it is still consistent with that of bli_thread_range_slrr() since + // these functions are used together in packm. + +#ifdef BLIS_ENABLE_JRIR_RR + return bli_is_my_iter_rr( i, work_id, n_way ); +#else // ifdef ( _SLAB || _TLB ) + return bli_is_my_iter_sl( i, st, en ); +#endif +} + // packbuf_t-related diff --git a/frame/include/blis.h b/frame/include/blis.h index 98ebee878d..70005e57d6 100644 --- a/frame/include/blis.h +++ b/frame/include/blis.h @@ -83,6 +83,10 @@ extern "C" { // -- Threading definitions -- #include "bli_thread.h" +#include "bli_thread_range.h" +#include "bli_thread_range_slab_rr.h" +#include "bli_thread_range_tlb.h" + #include "bli_pthread.h" diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 4cba76b207..d41f370539 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -108,907 +108,6 @@ void bli_thread_launch // ----------------------------------------------------------------------------- -void bli_thread_range_sub - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end - ) -{ - dim_t n_way = bli_thrinfo_n_way( thread ); - - if ( n_way == 1 ) { *start = 0; *end = n; return; } - - dim_t work_id = bli_thrinfo_work_id( thread ); - - dim_t all_start = 0; - dim_t all_end = n; - - dim_t size = all_end - all_start; - - dim_t n_bf_whole = size / bf; - dim_t n_bf_left = size % bf; - - dim_t n_bf_lo = n_bf_whole / n_way; - dim_t n_bf_hi = n_bf_whole / n_way; - - // In this function, we partition the space between all_start and - // all_end into n_way partitions, each a multiple of block_factor - // with the exception of the one partition that recieves the - // "edge" case (if applicable). - // - // Here are examples of various thread partitionings, in units of - // the block_factor, when n_way = 4. (A '+' indicates the thread - // that receives the leftover edge case (ie: n_bf_left extra - // rows/columns in its sub-range). - // (all_start ... all_end) - // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3 - // 12 =0 f 0 4 3 3 3 3 - // 12 >0 f 0 4 3 3 3 3+ - // 13 >0 f 1 3 4 3 3 3+ - // 14 >0 f 2 2 4 4 3 3+ - // 15 >0 f 3 1 4 4 4 3+ - // 15 =0 f 3 1 4 4 4 3 - // - // 12 =0 t 4 0 3 3 3 3 - // 12 >0 t 4 0 3+ 3 3 3 - // 13 >0 t 3 1 3+ 3 3 4 - // 14 >0 t 2 2 3+ 3 4 4 - // 15 >0 t 1 3 3+ 4 4 4 - // 15 =0 t 1 3 3 4 4 4 - - // As indicated by the table above, load is balanced as equally - // as possible, even in the presence of an edge case. - - // First, we must differentiate between cases where the leftover - // "edge" case (n_bf_left) should be allocated to a thread partition - // at the low end of the index range or the high end. - - if ( handle_edge_low == FALSE ) - { - // Notice that if all threads receive the same number of - // block_factors, those threads are considered "high" and - // the "low" thread group is empty. 
- dim_t n_th_lo = n_bf_whole % n_way; - //dim_t n_th_hi = n_way - n_th_lo; - - // If some partitions must have more block_factors than others - // assign the slightly larger partitions to lower index threads. - if ( n_th_lo != 0 ) n_bf_lo += 1; - - // Compute the actual widths (in units of rows/columns) of - // individual threads in the low and high groups. - dim_t size_lo = n_bf_lo * bf; - dim_t size_hi = n_bf_hi * bf; - - // Precompute the starting indices of the low and high groups. - dim_t lo_start = all_start; - dim_t hi_start = all_start + n_th_lo * size_lo; - - // Compute the start and end of individual threads' ranges - // as a function of their work_ids and also the group to which - // they belong (low or high). - if ( work_id < n_th_lo ) - { - *start = lo_start + (work_id ) * size_lo; - *end = lo_start + (work_id+1) * size_lo; - } - else // if ( n_th_lo <= work_id ) - { - *start = hi_start + (work_id-n_th_lo ) * size_hi; - *end = hi_start + (work_id-n_th_lo+1) * size_hi; - - // Since the edge case is being allocated to the high - // end of the index range, we have to advance the last - // thread's end. - if ( work_id == n_way - 1 ) *end += n_bf_left; - } - } - else // if ( handle_edge_low == TRUE ) - { - // Notice that if all threads receive the same number of - // block_factors, those threads are considered "low" and - // the "high" thread group is empty. - dim_t n_th_hi = n_bf_whole % n_way; - dim_t n_th_lo = n_way - n_th_hi; - - // If some partitions must have more block_factors than others - // assign the slightly larger partitions to higher index threads. - if ( n_th_hi != 0 ) n_bf_hi += 1; - - // Compute the actual widths (in units of rows/columns) of - // individual threads in the low and high groups. - dim_t size_lo = n_bf_lo * bf; - dim_t size_hi = n_bf_hi * bf; - - // Precompute the starting indices of the low and high groups. - dim_t lo_start = all_start; - dim_t hi_start = all_start + n_th_lo * size_lo - + n_bf_left; - - // Compute the start and end of individual threads' ranges - // as a function of their work_ids and also the group to which - // they belong (low or high). - if ( work_id < n_th_lo ) - { - *start = lo_start + (work_id ) * size_lo; - *end = lo_start + (work_id+1) * size_lo; - - // Since the edge case is being allocated to the low - // end of the index range, we have to advance the - // starts/ends accordingly. 
- if ( work_id == 0 ) *end += n_bf_left; - else { *start += n_bf_left; - *end += n_bf_left; } - } - else // if ( n_th_lo <= work_id ) - { - *start = hi_start + (work_id-n_th_lo ) * size_hi; - *end = hi_start + (work_id-n_th_lo+1) * size_hi; - } - } -} - -siz_t bli_thread_range_l2r - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - num_t dt = bli_obj_dt( a ); - dim_t m = bli_obj_length_after_trans( a ); - dim_t n = bli_obj_width_after_trans( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - bli_thread_range_sub( thr, n, bf, - FALSE, start, end ); - - return m * ( *end - *start ); -} - -siz_t bli_thread_range_r2l - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - num_t dt = bli_obj_dt( a ); - dim_t m = bli_obj_length_after_trans( a ); - dim_t n = bli_obj_width_after_trans( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - bli_thread_range_sub( thr, n, bf, - TRUE, start, end ); - - return m * ( *end - *start ); -} - -siz_t bli_thread_range_t2b - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - num_t dt = bli_obj_dt( a ); - dim_t m = bli_obj_length_after_trans( a ); - dim_t n = bli_obj_width_after_trans( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - bli_thread_range_sub( thr, m, bf, - FALSE, start, end ); - - return n * ( *end - *start ); -} - -siz_t bli_thread_range_b2t - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - num_t dt = bli_obj_dt( a ); - dim_t m = bli_obj_length_after_trans( a ); - dim_t n = bli_obj_width_after_trans( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - bli_thread_range_sub( thr, m, bf, - TRUE, start, end ); - - return n * ( *end - *start ); -} - -// ----------------------------------------------------------------------------- - -dim_t bli_thread_range_width_l - ( - doff_t diagoff_j, - dim_t m, - dim_t n_j, - dim_t j, - dim_t n_way, - dim_t bf, - dim_t bf_left, - double area_per_thr, - bool handle_edge_low - ) -{ - dim_t width; - - // In this function, we assume that we are somewhere in the process of - // partitioning an m x n lower-stored region (with arbitrary diagonal - // offset) n_ways along the n dimension (into column panels). The value - // j identifies the left-to-right subpartition index (from 0 to n_way-1) - // of the subpartition whose width we are about to compute using the - // area per thread determined by the caller. n_j is the number of - // columns in the remaining region of the matrix being partitioned, - // and diagoff_j is that region's diagonal offset. - - // If this is the last subpartition, the width is simply equal to n_j. - // Note that this statement handles cases where the "edge case" (if - // one exists) is assigned to the high end of the index range (ie: - // handle_edge_low == FALSE). - if ( j == n_way - 1 ) return n_j; - - // At this point, we know there are at least two subpartitions left. - // We also know that IF the submatrix contains a completely dense - // rectangular submatrix, it will occur BEFORE the triangular (or - // trapezoidal) part. - - // Here, we implement a somewhat minor load balancing optimization - // that ends up getting employed only for relatively small matrices. - // First, recall that all subpartition widths will be some multiple - // of the blocking factor bf, except perhaps either the first or last - // subpartition, which will receive the edge case, if it exists. 
- // Also recall that j represents the current thread (or thread group, - // or "caucus") for which we are computing a subpartition width. - // If n_j is sufficiently small that we can only allocate bf columns - // to each of the remaining threads, then we set the width to bf. We - // do not allow the subpartition width to be less than bf, so, under - // some conditions, if n_j is small enough, some of the reamining - // threads may not get any work. For the purposes of this lower bound - // on work (ie: width >= bf), we allow the edge case to count as a - // "full" set of bf columns. - { - dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 ); - - if ( n_j_bf <= n_way - j ) - { - if ( j == 0 && handle_edge_low ) - width = ( bf_left > 0 ? bf_left : bf ); - else - width = bf; - - // Make sure that the width does not exceed n_j. This would - // occur if and when n_j_bf < n_way - j; that is, when the - // matrix being partitioned is sufficiently small relative to - // n_way such that there is not even enough work for every - // (remaining) thread to get bf (or bf_left) columns. The - // net effect of this safeguard is that some threads may get - // assigned empty ranges (ie: no work), which of course must - // happen in some situations. - if ( width > n_j ) width = n_j; - - return width; - } - } - - // This block computes the width assuming that we are entirely within - // a dense rectangle that precedes the triangular (or trapezoidal) - // part. - { - // First compute the width of the current panel under the - // assumption that the diagonal offset would not intersect. - width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m ); - - // Adjust the width, if necessary. Specifically, we may need - // to allocate the edge case to the first subpartition, if - // requested; otherwise, we just need to ensure that the - // subpartition is a multiple of the blocking factor. - if ( j == 0 && handle_edge_low ) - { - if ( width % bf != bf_left ) width += bf_left - ( width % bf ); - } - else // if interior case - { - // Round up to the next multiple of the blocking factor. - //if ( width % bf != 0 ) width += bf - ( width % bf ); - // Round to the nearest multiple of the blocking factor. - if ( width % bf != 0 ) width = bli_round_to_mult( width, bf ); - } - } - - // We need to recompute width if the panel, according to the width - // as currently computed, would intersect the diagonal. - if ( diagoff_j < width ) - { - dim_t offm_inc, offn_inc; - - // Prune away the unstored region above the diagonal, if it exists. - // Note that the entire region was pruned initially, so we know that - // we don't need to try to prune the right side. (Also, we discard - // the offset deltas since we don't need to actually index into the - // subpartition.) - bli_prune_unstored_region_top_l( &diagoff_j, &m, &n_j, &offm_inc ); - //bli_prune_unstored_region_right_l( &diagoff_j, &m, &n_j, &offn_inc ); - - // We don't need offm_inc, offn_inc here. These statements should - // prevent compiler warnings. - ( void )offm_inc; - ( void )offn_inc; - - // Prepare to solve a quadratic equation to find the width of the - // current (jth) subpartition given the m dimension, diagonal offset, - // and area. - // NOTE: We know that the +/- in the quadratic formula must be a + - // here because we know that the desired solution (the subpartition - // width) will be smaller than (m + diagoff), not larger. If you - // don't believe me, draw a picture! 
- const double a = -0.5; - const double b = ( double )m + ( double )diagoff_j + 0.5; - const double c = -0.5 * ( ( double )diagoff_j * - ( ( double )diagoff_j + 1.0 ) - ) - area_per_thr; - const double r = b * b - 4.0 * a * c; - - // If the quadratic solution is not imaginary, round it and use that - // as our width, but make sure it didn't round to zero. Otherwise, - // discard the quadratic solution and leave width, as previously - // computed, unchanged. - if ( r >= 0.0 ) - { - const double x = ( -b + sqrt( r ) ) / ( 2.0 * a ); - - width = ( dim_t )bli_round( x ); - if ( width == 0 ) width = 1; - } - - // Adjust the width, if necessary. - if ( j == 0 && handle_edge_low ) - { - if ( width % bf != bf_left ) width += bf_left - ( width % bf ); - } - else // if interior case - { - // Round up to the next multiple of the blocking factor. - //if ( width % bf != 0 ) width += bf - ( width % bf ); - // Round to the nearest multiple of the blocking factor. - if ( width % bf != 0 ) width = bli_round_to_mult( width, bf ); - } - } - - // Make sure that the width, after being adjusted, does not cause the - // subpartition to exceed n_j. - if ( width > n_j ) width = n_j; - - return width; -} - -siz_t bli_find_area_trap_l - ( - dim_t m, - dim_t n, - doff_t diagoff - ) -{ - dim_t offm_inc = 0; - dim_t offn_inc = 0; - double tri_area; - double area; - - // Prune away any rectangular region above where the diagonal - // intersects the left edge of the subpartition, if it exists. - bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc ); - - // Prune away any rectangular region to the right of where the - // diagonal intersects the bottom edge of the subpartition, if - // it exists. (This shouldn't ever be needed, since the caller - // would presumably have already performed rightward pruning, - // but it's here just in case.) - bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc ); - - ( void )offm_inc; - ( void )offn_inc; - - // Compute the area of the empty triangle so we can subtract it - // from the area of the rectangle that bounds the subpartition. - if ( bli_intersects_diag_n( diagoff, m, n ) ) - { - double tri_dim = ( double )( n - diagoff - 1 ); - tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; - } - else - { - // If the diagonal does not intersect the trapezoid, then - // we can compute the area as a simple rectangle. - tri_area = 0.0; - } - - area = ( double )m * ( double )n - tri_area; - - return ( siz_t )area; -} - -// ----------------------------------------------------------------------------- - -siz_t bli_thread_range_weighted_sub - ( - const thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr - ) -{ - dim_t n_way = bli_thrinfo_n_way( thread ); - dim_t my_id = bli_thrinfo_work_id( thread ); - - dim_t bf_left = n % bf; - - dim_t j; - - dim_t off_j; - doff_t diagoff_j; - dim_t n_left; - - dim_t width_j; - - dim_t offm_inc, offn_inc; - - double tri_dim, tri_area; - double area_total, area_per_thr; - - siz_t area = 0; - - // In this function, we assume that the caller has already determined - // that (a) the diagonal intersects the submatrix, and (b) the submatrix - // is either lower- or upper-stored. - - if ( bli_is_lower( uplo ) ) - { - // Prune away the unstored region above the diagonal, if it exists, - // and then to the right of where the diagonal intersects the bottom, - // if it exists. 
(Also, we discard the offset deltas since we don't - // need to actually index into the subpartition.) - bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc ); - bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc ); - - // We don't need offm_inc, offn_inc here. These statements should - // prevent compiler warnings. - ( void )offm_inc; - ( void )offn_inc; - - // Now that pruning has taken place, we know that diagoff >= 0. - - // Compute the total area of the submatrix, accounting for the - // location of the diagonal, and divide it by the number of ways - // of parallelism. - tri_dim = ( double )( n - diagoff - 1 ); - tri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; - area_total = ( double )m * ( double )n - tri_area; - area_per_thr = area_total / ( double )n_way; - - // Initialize some variables prior to the loop: the offset to the - // current subpartition, the remainder of the n dimension, and - // the diagonal offset of the current subpartition. - off_j = 0; - diagoff_j = diagoff; - n_left = n; - - // Iterate over the subpartition indices corresponding to each - // thread/caucus participating in the n_way parallelism. - for ( j = 0; j < n_way; ++j ) - { - // Compute the width of the jth subpartition, taking the - // current diagonal offset into account, if needed. - width_j = - bli_thread_range_width_l - ( - diagoff_j, m, n_left, - j, n_way, - bf, bf_left, - area_per_thr, - handle_edge_low - ); - - // If the current thread belongs to caucus j, this is his - // subpartition. So we compute the implied index range and - // end our search. - if ( j == my_id ) - { - *j_start_thr = off_j; - *j_end_thr = off_j + width_j; - - area = bli_find_area_trap_l( m, width_j, diagoff_j ); - - break; - } - - // Shift the current subpartition's starting and diagonal offsets, - // as well as the remainder of the n dimension, according to the - // computed width, and then iterate to the next subpartition. - off_j += width_j; - diagoff_j -= width_j; - n_left -= width_j; - } - } - else // if ( bli_is_upper( uplo ) ) - { - // Express the upper-stored case in terms of the lower-stored case. - - // First, we convert the upper-stored trapezoid to an equivalent - // lower-stored trapezoid by rotating it 180 degrees. - bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - - // Now that the trapezoid is "flipped" in the n dimension, negate - // the bool that encodes whether to handle the edge case at the - // low (or high) end of the index range. - bli_toggle_bool( &handle_edge_low ); - - // Compute the appropriate range for the rotated trapezoid. - area = bli_thread_range_weighted_sub - ( - thread, diagoff, uplo, m, n, bf, - handle_edge_low, - j_start_thr, j_end_thr - ); - - // Reverse the indexing basis for the subpartition ranges so that - // the indices, relative to left-to-right iteration through the - // unrotated upper-stored trapezoid, map to the correct columns - // (relative to the diagonal). This amounts to subtracting the - // range from n. 
- bli_reverse_index_direction( n, j_start_thr, j_end_thr ); - } - - return area; -} - -siz_t bli_thread_range_mdim - ( - dir_t direct, - const thrinfo_t* thr, - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntl_t* cntl, - const cntx_t* cntx, - dim_t* start, - dim_t* end - ) -{ - bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntl_family( cntl ); - - // This is part of trsm's current implementation, whereby right side - // cases are implemented in left-side micro-kernels, which requires - // we swap the usage of the register blocksizes for the purposes of - // packing A and B. - if ( family == BLIS_TRSM ) - { - if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR; - else bszid = BLIS_NR; - } - - const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); - const obj_t* x; - bool use_weighted; - - // Use the operation family to choose the one of the two matrices - // being partitioned that potentially has structure, and also to - // decide whether or not we need to use weighted range partitioning. - // NOTE: It's important that we use non-weighted range partitioning - // for hemm and symm (ie: the gemm family) because the weighted - // function will mistakenly skip over unstored regions of the - // structured matrix, even though they represent part of that matrix - // that will be dense and full (after packing). - if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } - else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } - else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } - else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } - - if ( use_weighted ) - { - if ( direct == BLIS_FWD ) - return bli_thread_range_weighted_t2b( thr, x, bmult, start, end ); - else - return bli_thread_range_weighted_b2t( thr, x, bmult, start, end ); - } - else - { - if ( direct == BLIS_FWD ) - return bli_thread_range_t2b( thr, x, bmult, start, end ); - else - return bli_thread_range_b2t( thr, x, bmult, start, end ); - } -} - -siz_t bli_thread_range_ndim - ( - dir_t direct, - const thrinfo_t* thr, - const obj_t* a, - const obj_t* b, - const obj_t* c, - const cntl_t* cntl, - const cntx_t* cntx, - dim_t* start, - dim_t* end - ) -{ - bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntl_family( cntl ); - - // This is part of trsm's current implementation, whereby right side - // cases are implemented in left-side micro-kernels, which requires - // we swap the usage of the register blocksizes for the purposes of - // packing A and B. - if ( family == BLIS_TRSM ) - { - if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR; - else bszid = BLIS_NR; - } - - const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); - const obj_t* x; - bool use_weighted; - - // Use the operation family to choose the one of the two matrices - // being partitioned that potentially has structure, and also to - // decide whether or not we need to use weighted range partitioning. - // NOTE: It's important that we use non-weighted range partitioning - // for hemm and symm (ie: the gemm family) because the weighted - // function will mistakenly skip over unstored regions of the - // structured matrix, even though they represent part of that matrix - // that will be dense and full (after packing). 
- if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } - else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } - else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } - else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } - - if ( use_weighted ) - { - if ( direct == BLIS_FWD ) - return bli_thread_range_weighted_l2r( thr, x, bmult, start, end ); - else - return bli_thread_range_weighted_r2l( thr, x, bmult, start, end ); - } - else - { - if ( direct == BLIS_FWD ) - return bli_thread_range_l2r( thr, x, bmult, start, end ); - else - return bli_thread_range_r2l( thr, x, bmult, start, end ); - } -} - -siz_t bli_thread_range_weighted_l2r - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - siz_t area; - - // This function assigns area-weighted ranges in the n dimension - // where the total range spans 0 to n-1 with 0 at the left end and - // n-1 at the right end. - - if ( bli_obj_intersects_diag( a ) && - bli_obj_is_upper_or_lower( a ) ) - { - num_t dt = bli_obj_dt( a ); - doff_t diagoff = bli_obj_diag_offset( a ); - uplo_t uplo = bli_obj_uplo( a ); - dim_t m = bli_obj_length( a ); - dim_t n = bli_obj_width( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - // Support implicit transposition. - if ( bli_obj_has_trans( a ) ) - { - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - } - - area = - bli_thread_range_weighted_sub - ( - thr, diagoff, uplo, m, n, bf, - FALSE, start, end - ); - } - else // if dense or zeros - { - area = bli_thread_range_l2r - ( - thr, a, bmult, - start, end - ); - } - - return area; -} - -siz_t bli_thread_range_weighted_r2l - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - siz_t area; - - // This function assigns area-weighted ranges in the n dimension - // where the total range spans 0 to n-1 with 0 at the right end and - // n-1 at the left end. - - if ( bli_obj_intersects_diag( a ) && - bli_obj_is_upper_or_lower( a ) ) - { - num_t dt = bli_obj_dt( a ); - doff_t diagoff = bli_obj_diag_offset( a ); - uplo_t uplo = bli_obj_uplo( a ); - dim_t m = bli_obj_length( a ); - dim_t n = bli_obj_width( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - // Support implicit transposition. - if ( bli_obj_has_trans( a ) ) - { - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - } - - bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - - area = - bli_thread_range_weighted_sub - ( - thr, diagoff, uplo, m, n, bf, - TRUE, start, end - ); - } - else // if dense or zeros - { - area = bli_thread_range_r2l - ( - thr, a, bmult, - start, end - ); - } - - return area; -} - -siz_t bli_thread_range_weighted_t2b - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - siz_t area; - - // This function assigns area-weighted ranges in the m dimension - // where the total range spans 0 to m-1 with 0 at the top end and - // m-1 at the bottom end. - - if ( bli_obj_intersects_diag( a ) && - bli_obj_is_upper_or_lower( a ) ) - { - num_t dt = bli_obj_dt( a ); - doff_t diagoff = bli_obj_diag_offset( a ); - uplo_t uplo = bli_obj_uplo( a ); - dim_t m = bli_obj_length( a ); - dim_t n = bli_obj_width( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - // Support implicit transposition. 
- if ( bli_obj_has_trans( a ) ) - { - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - } - - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - - area = - bli_thread_range_weighted_sub - ( - thr, diagoff, uplo, m, n, bf, - FALSE, start, end - ); - } - else // if dense or zeros - { - area = bli_thread_range_t2b - ( - thr, a, bmult, - start, end - ); - } - - return area; -} - -siz_t bli_thread_range_weighted_b2t - ( - const thrinfo_t* thr, - const obj_t* a, - const blksz_t* bmult, - dim_t* start, - dim_t* end - ) -{ - siz_t area; - - // This function assigns area-weighted ranges in the m dimension - // where the total range spans 0 to m-1 with 0 at the bottom end and - // m-1 at the top end. - - if ( bli_obj_intersects_diag( a ) && - bli_obj_is_upper_or_lower( a ) ) - { - num_t dt = bli_obj_dt( a ); - doff_t diagoff = bli_obj_diag_offset( a ); - uplo_t uplo = bli_obj_uplo( a ); - dim_t m = bli_obj_length( a ); - dim_t n = bli_obj_width( a ); - dim_t bf = bli_blksz_get_def( dt, bmult ); - - // Support implicit transposition. - if ( bli_obj_has_trans( a ) ) - { - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - } - - bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); - - bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); - - area = bli_thread_range_weighted_sub - ( - thr, diagoff, uplo, m, n, bf, - TRUE, start, end - ); - } - else // if dense or zeros - { - area = bli_thread_range_b2t - ( - thr, a, bmult, - start, end - ); - } - - return area; -} - -// ----------------------------------------------------------------------------- - void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors ) { factors->n = n; diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index e61fc8b892..5002672dc4 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -56,6 +56,8 @@ typedef void (*thread_func_t)( thrcomm_t* gl_comm, dim_t tid, const void* params void bli_thread_init( void ); void bli_thread_finalize( void ); +// ----------------------------------------------------------------------------- + BLIS_EXPORT_BLIS void bli_thread_launch ( timpl_t ti, @@ -64,91 +66,6 @@ BLIS_EXPORT_BLIS void bli_thread_launch const void* params ); -// Thread range-related prototypes. 
- -BLIS_EXPORT_BLIS void bli_thread_range_sub - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end - ); - -#undef GENPROT -#define GENPROT( opname ) \ -\ -siz_t PASTEMAC0( opname ) \ - ( \ - dir_t direct, \ - const thrinfo_t* thr, \ - const obj_t* a, \ - const obj_t* b, \ - const obj_t* c, \ - const cntl_t* cntl, \ - const cntx_t* cntx, \ - dim_t* start, \ - dim_t* end \ - ); - -GENPROT( thread_range_mdim ) -GENPROT( thread_range_ndim ) - -#undef GENPROT -#define GENPROT( opname ) \ -\ -siz_t PASTEMAC0( opname ) \ - ( \ - const thrinfo_t* thr, \ - const obj_t* a, \ - const blksz_t* bmult, \ - dim_t* start, \ - dim_t* end \ - ); - -GENPROT( thread_range_l2r ) -GENPROT( thread_range_r2l ) -GENPROT( thread_range_t2b ) -GENPROT( thread_range_b2t ) - -GENPROT( thread_range_weighted_l2r ) -GENPROT( thread_range_weighted_r2l ) -GENPROT( thread_range_weighted_t2b ) -GENPROT( thread_range_weighted_b2t ) - - -dim_t bli_thread_range_width_l - ( - doff_t diagoff_j, - dim_t m, - dim_t n_j, - dim_t j, - dim_t n_way, - dim_t bf, - dim_t bf_left, - double area_per_thr, - bool handle_edge_low - ); -siz_t bli_find_area_trap_l - ( - dim_t m, - dim_t n, - doff_t diagoff - ); -siz_t bli_thread_range_weighted_sub - ( - const thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr - ); - // ----------------------------------------------------------------------------- // Factorization and partitioning prototypes @@ -212,98 +129,5 @@ BLIS_EXPORT_BLIS void bli_thread_set_thread_impl( timpl_t ti ); void bli_thread_init_rntm_from_env( rntm_t* rntm ); -// ----------------------------------------------------------------------------- - -BLIS_INLINE void bli_thread_range_jrir_rr - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Use interleaved partitioning of jr/ir loops. - *start = bli_thrinfo_work_id( thread ); - *inc = bli_thrinfo_n_way( thread ); - *end = n; -} - -BLIS_INLINE void bli_thread_range_jrir_sl - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Use contiguous slab partitioning of jr/ir loops. - bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); - *inc = 1; -} - -BLIS_INLINE void bli_thread_range_jrir - ( - const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ - // Define a general-purpose version of bli_thread_range_jrir() whose - // definition depends on whether slab or round-robin partitioning was - // requested at configure-time. -#ifdef BLIS_ENABLE_JRIR_SLAB - bli_thread_range_jrir_sl( thread, n, bf, handle_edge_low, start, end, inc ); -#else - bli_thread_range_jrir_rr( thread, n, bf, handle_edge_low, start, end, inc ); -#endif -} - -#if 0 -BLIS_INLINE void bli_thread_range_weighted_jrir - ( - thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc - ) -{ -#ifdef BLIS_ENABLE_JRIR_SLAB - - // Use contiguous slab partitioning for jr/ir loops. - bli_thread_range_weighted_sub( thread, diagoff, uplo, m, n, bf, - handle_edge_low, start, end ); - - *start = *start / bf; *inc = 1; - - if ( *end % bf ) *end = *end / bf + 1; - else *end = *end / bf; - -#else - // Use interleaved partitioning of jr/ir loops. 
- *start = bli_thrinfo_work_id( thread ); - *inc = bli_thrinfo_n_way( thread ); - *end = n; - -#endif -} #endif - -#endif - diff --git a/frame/thread/bli_thread_range.c b/frame/thread/bli_thread_range.c new file mode 100644 index 0000000000..a28e529b02 --- /dev/null +++ b/frame/thread/bli_thread_range.c @@ -0,0 +1,1121 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_thread_range_sub + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end + ) +{ + dim_t n_way = bli_thrinfo_n_way( thread ); + + if ( n_way == 1 ) { *start = 0; *end = n; return; } + + dim_t work_id = bli_thrinfo_work_id( thread ); + + dim_t all_start = 0; + dim_t all_end = n; + + dim_t size = all_end - all_start; + + dim_t n_bf_whole = size / bf; + dim_t n_bf_left = size % bf; + + dim_t n_bf_lo = n_bf_whole / n_way; + dim_t n_bf_hi = n_bf_whole / n_way; + + // In this function, we partition the space between all_start and + // all_end into n_way partitions, each a multiple of block_factor + // with the exception of the one partition that recieves the + // "edge" case (if applicable). + // + // Here are examples of various thread partitionings, in units of + // the block_factor, when n_way = 4. (A '+' indicates the thread + // that receives the leftover edge case (ie: n_bf_left extra + // rows/columns in its sub-range). + // (all_start ... 
all_end) + // n_bf_whole _left hel n_th_lo _hi thr0 thr1 thr2 thr3 + // 12 =0 f 0 4 3 3 3 3 + // 12 >0 f 0 4 3 3 3 3+ + // 13 >0 f 1 3 4 3 3 3+ + // 14 >0 f 2 2 4 4 3 3+ + // 15 >0 f 3 1 4 4 4 3+ + // 15 =0 f 3 1 4 4 4 3 + // + // 12 =0 t 4 0 3 3 3 3 + // 12 >0 t 4 0 3+ 3 3 3 + // 13 >0 t 3 1 3+ 3 3 4 + // 14 >0 t 2 2 3+ 3 4 4 + // 15 >0 t 1 3 3+ 4 4 4 + // 15 =0 t 1 3 3 4 4 4 + + // As indicated by the table above, load is balanced as equally + // as possible, even in the presence of an edge case. + + // First, we must differentiate between cases where the leftover + // "edge" case (n_bf_left) should be allocated to a thread partition + // at the low end of the index range or the high end. + + if ( handle_edge_low == FALSE ) + { + // Notice that if all threads receive the same number of + // block_factors, those threads are considered "high" and + // the "low" thread group is empty. + dim_t n_th_lo = n_bf_whole % n_way; + //dim_t n_th_hi = n_way - n_th_lo; + + // If some partitions must have more block_factors than others + // assign the slightly larger partitions to lower index threads. + if ( n_th_lo != 0 ) n_bf_lo += 1; + + // Compute the actual widths (in units of rows/columns) of + // individual threads in the low and high groups. + dim_t size_lo = n_bf_lo * bf; + dim_t size_hi = n_bf_hi * bf; + + // Precompute the starting indices of the low and high groups. + dim_t lo_start = all_start; + dim_t hi_start = all_start + n_th_lo * size_lo; + + // Compute the start and end of individual threads' ranges + // as a function of their work_ids and also the group to which + // they belong (low or high). + if ( work_id < n_th_lo ) + { + *start = lo_start + (work_id ) * size_lo; + *end = lo_start + (work_id+1) * size_lo; + } + else // if ( n_th_lo <= work_id ) + { + *start = hi_start + (work_id-n_th_lo ) * size_hi; + *end = hi_start + (work_id-n_th_lo+1) * size_hi; + + // Since the edge case is being allocated to the high + // end of the index range, we have to advance the last + // thread's end. + if ( work_id == n_way - 1 ) *end += n_bf_left; + } + } + else // if ( handle_edge_low == TRUE ) + { + // Notice that if all threads receive the same number of + // block_factors, those threads are considered "low" and + // the "high" thread group is empty. + dim_t n_th_hi = n_bf_whole % n_way; + dim_t n_th_lo = n_way - n_th_hi; + + // If some partitions must have more block_factors than others + // assign the slightly larger partitions to higher index threads. + if ( n_th_hi != 0 ) n_bf_hi += 1; + + // Compute the actual widths (in units of rows/columns) of + // individual threads in the low and high groups. + dim_t size_lo = n_bf_lo * bf; + dim_t size_hi = n_bf_hi * bf; + + // Precompute the starting indices of the low and high groups. + dim_t lo_start = all_start; + dim_t hi_start = all_start + n_th_lo * size_lo + + n_bf_left; + + // Compute the start and end of individual threads' ranges + // as a function of their work_ids and also the group to which + // they belong (low or high). + if ( work_id < n_th_lo ) + { + *start = lo_start + (work_id ) * size_lo; + *end = lo_start + (work_id+1) * size_lo; + + // Since the edge case is being allocated to the low + // end of the index range, we have to advance the + // starts/ends accordingly. 
+ if ( work_id == 0 ) *end += n_bf_left; + else { *start += n_bf_left; + *end += n_bf_left; } + } + else // if ( n_th_lo <= work_id ) + { + *start = hi_start + (work_id-n_th_lo ) * size_hi; + *end = hi_start + (work_id-n_th_lo+1) * size_hi; + } + } +} + +// ----------------------------------------------------------------------------- + +siz_t bli_thread_range_l2r + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + num_t dt = bli_obj_dt( a ); + dim_t m = bli_obj_length_after_trans( a ); + dim_t n = bli_obj_width_after_trans( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + bli_thread_range_sub( thr, n, bf, + FALSE, start, end ); + + return m * ( *end - *start ); +} + +siz_t bli_thread_range_r2l + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + num_t dt = bli_obj_dt( a ); + dim_t m = bli_obj_length_after_trans( a ); + dim_t n = bli_obj_width_after_trans( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + bli_thread_range_sub( thr, n, bf, + TRUE, start, end ); + + return m * ( *end - *start ); +} + +siz_t bli_thread_range_t2b + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + num_t dt = bli_obj_dt( a ); + dim_t m = bli_obj_length_after_trans( a ); + dim_t n = bli_obj_width_after_trans( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + bli_thread_range_sub( thr, m, bf, + FALSE, start, end ); + + return n * ( *end - *start ); +} + +siz_t bli_thread_range_b2t + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + num_t dt = bli_obj_dt( a ); + dim_t m = bli_obj_length_after_trans( a ); + dim_t n = bli_obj_width_after_trans( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + bli_thread_range_sub( thr, m, bf, + TRUE, start, end ); + + return n * ( *end - *start ); +} + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_width_l + ( + doff_t diagoff_j, + dim_t m, + dim_t n_j, + dim_t j, + dim_t n_way, + dim_t bf, + dim_t bf_left, + double area_per_thr, + bool handle_edge_low + ) +{ + dim_t width; + + // In this function, we assume that we are somewhere in the process of + // partitioning an m x n lower-stored region (with arbitrary diagonal + // offset) n_ways along the n dimension (into column panels). The value + // j identifies the left-to-right subpartition index (from 0 to n_way-1) + // of the subpartition whose width we are about to compute using the + // area per thread determined by the caller. n_j is the number of + // columns in the remaining region of the matrix being partitioned, + // and diagoff_j is that region's diagonal offset. + + // If this is the last subpartition, the width is simply equal to n_j. + // Note that this statement handles cases where the "edge case" (if + // one exists) is assigned to the high end of the index range (ie: + // handle_edge_low == FALSE). + if ( j == n_way - 1 ) return n_j; + + // At this point, we know there are at least two subpartitions left. + // We also know that IF the submatrix contains a completely dense + // rectangular submatrix, it will occur BEFORE the triangular (or + // trapezoidal) part. + + // Here, we implement a somewhat minor load balancing optimization + // that ends up getting employed only for relatively small matrices. 
+ // First, recall that all subpartition widths will be some multiple + // of the blocking factor bf, except perhaps either the first or last + // subpartition, which will receive the edge case, if it exists. + // Also recall that j represents the current thread (or thread group, + // or "caucus") for which we are computing a subpartition width. + // If n_j is sufficiently small that we can only allocate bf columns + // to each of the remaining threads, then we set the width to bf. We + // do not allow the subpartition width to be less than bf, so, under + // some conditions, if n_j is small enough, some of the remaining + // threads may not get any work. For the purposes of this lower bound + // on work (ie: width >= bf), we allow the edge case to count as a + // "full" set of bf columns. + { + dim_t n_j_bf = n_j / bf + ( bf_left > 0 ? 1 : 0 ); + + if ( n_j_bf <= n_way - j ) + { + if ( j == 0 && handle_edge_low ) + width = ( bf_left > 0 ? bf_left : bf ); + else + width = bf; + + // Make sure that the width does not exceed n_j. This would + // occur if and when n_j_bf < n_way - j; that is, when the + // matrix being partitioned is sufficiently small relative to + // n_way such that there is not even enough work for every + // (remaining) thread to get bf (or bf_left) columns. The + // net effect of this safeguard is that some threads may get + // assigned empty ranges (ie: no work), which of course must + // happen in some situations. + if ( width > n_j ) width = n_j; + + return width; + } + } + + // This block computes the width assuming that we are entirely within + // a dense rectangle that precedes the triangular (or trapezoidal) + // part. + { + // First compute the width of the current panel under the + // assumption that the diagonal offset would not intersect. + width = ( dim_t )bli_round( ( double )area_per_thr / ( double )m ); + + // Adjust the width, if necessary. Specifically, we may need + // to allocate the edge case to the first subpartition, if + // requested; otherwise, we just need to ensure that the + // subpartition is a multiple of the blocking factor. + if ( j == 0 && handle_edge_low ) + { + if ( width % bf != bf_left ) width += bf_left - ( width % bf ); + } + else // if interior case + { + // Round up to the next multiple of the blocking factor. + //if ( width % bf != 0 ) width += bf - ( width % bf ); + // Round to the nearest multiple of the blocking factor. + if ( width % bf != 0 ) width = bli_round_to_mult( width, bf ); + } + } + + // We need to recompute width if the panel, according to the width + // as currently computed, would intersect the diagonal. + if ( diagoff_j < width ) + { + dim_t offm_inc, offn_inc; + + // Prune away the unstored region above the diagonal, if it exists. + // Note that the entire region was pruned initially, so we know that + // we don't need to try to prune the right side. (Also, we discard + // the offset deltas since we don't need to actually index into the + // subpartition.) + bli_prune_unstored_region_top_l( &diagoff_j, &m, &n_j, &offm_inc ); + //bli_prune_unstored_region_right_l( &diagoff_j, &m, &n_j, &offn_inc ); + + // We don't need offm_inc, offn_inc here. These statements should + // prevent compiler warnings. + ( void )offm_inc; + ( void )offn_inc; + + // Prepare to solve a quadratic equation to find the width of the + // current (jth) subpartition given the m dimension, diagonal offset, + // and area.
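One way to see where the quadratic's coefficients come from (a sketch, writing d for diagoff_j and A for area_per_thr): after the pruning above, d >= 0, and a candidate width w, with w > d, covers a stored (trapezoidal) area of

    area(w) = w*m - (w - d - 1)*(w - d)/2

since column j, for 0 <= j < w, stores m - max(0, j - d) rows. Setting area(w) = A and collecting terms in w gives

    -0.5*w^2 + (m + d + 0.5)*w - 0.5*d*(d + 1) - A = 0

whose coefficients are exactly the a, b, and c assigned below; the smaller of the two roots is the width being sought (see the NOTE that follows).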
+ // NOTE: We know that the +/- in the quadratic formula must be a + + // here because we know that the desired solution (the subpartition + // width) will be smaller than (m + diagoff), not larger. If you + // don't believe me, draw a picture! + const double a = -0.5; + const double b = ( double )m + ( double )diagoff_j + 0.5; + const double c = -0.5 * ( ( double )diagoff_j * + ( ( double )diagoff_j + 1.0 ) + ) - area_per_thr; + const double r = b * b - 4.0 * a * c; + + // If the quadratic solution is not imaginary, round it and use that + // as our width (but make sure it didn't round to zero). Otherwise, + // discard the quadratic solution and leave width, as previously + // computed, unchanged. + if ( r >= 0.0 ) + { + const double x = ( -b + sqrt( r ) ) / ( 2.0 * a ); + + width = ( dim_t )bli_round( x ); + if ( width == 0 ) width = 1; + } + + // Adjust the width, if necessary. + if ( j == 0 && handle_edge_low ) + { + if ( width % bf != bf_left ) width += bf_left - ( width % bf ); + } + else // if interior case + { + // Round up to the next multiple of the blocking factor. + //if ( width % bf != 0 ) width += bf - ( width % bf ); + // Round to the nearest multiple of the blocking factor. + if ( width % bf != 0 ) width = bli_round_to_mult( width, bf ); + } + } + + // Make sure that the width, after being adjusted, does not cause the + // subpartition to exceed n_j. + if ( width > n_j ) width = n_j; + + return width; +} + +siz_t bli_find_area_trap_l + ( + doff_t diagoff, + dim_t m, + dim_t n, + dim_t bf + ) +{ + dim_t offm_inc = 0; + dim_t offn_inc = 0; + double utri_area; + double blktri_area; + + // Prune away any rectangular region above where the diagonal + // intersects the left edge of the subpartition, if it exists. + bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc ); + + // Prune away any rectangular region to the right of where the + // diagonal intersects the bottom edge of the subpartition, if + // it exists. (This shouldn't ever be needed, since the caller + // would presumably have already performed rightward pruning, + // but it's here just in case.) + //bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc ); + + ( void )offm_inc; + ( void )offn_inc; + + // Compute the area of the empty triangle so we can subtract it + // from the area of the rectangle that bounds the subpartition. + if ( bli_intersects_diag_n( diagoff, m, n ) ) + { + double tri_dim = ( double )( n - diagoff - 1 ); + tri_dim = bli_min( tri_dim, m - 1 ); + + utri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; + blktri_area = tri_dim * ( bf - 1.0 ) / 2.0; + } + else + { + // If the diagonal does not intersect the trapezoid, then + // we can compute the area as a simple rectangle. + utri_area = 0.0; + blktri_area = 0.0; + } + + double area = ( double )m * ( double )n - utri_area + blktri_area; + + return ( siz_t )area; +} + +// ----------------------------------------------------------------------------- + +siz_t bli_thread_range_weighted_sub + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + uplo_t uplo_orig, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr + ) +{ + dim_t n_way = bli_thrinfo_n_way( thread ); + dim_t my_id = bli_thrinfo_work_id( thread ); + + dim_t bf_left = n % bf; + + dim_t offm_inc, offn_inc; + + siz_t area = 0; + + // In this function, we assume that the caller has already determined + // that (a) the diagonal intersects the submatrix, and (b) the submatrix + // is either lower- or upper-stored. 
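+	// (Editorial note: for a lower-stored submatrix, the leftmost microtile
+	// columns contain the most stored rows, so we should expect the lower
+	// thread/caucus ids, which are assigned the leftmost subpartitions in
+	// the loop further below, to receive narrower column ranges than the
+	// higher ids, with each range covering roughly the same stored area.)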
+
+	if ( bli_is_lower( uplo ) )
+	{
+		#if 0
+		if ( n_way > 1 )
+		printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (lower)\n",
+		(int)my_id, (int)(m), (int)(n), (int)(diagoff) );
+		#endif
+
+		// Prune away the unstored region above the diagonal, if it exists,
+		// and then to the right of where the diagonal intersects the bottom,
+		// if it exists. (Also, we discard the offset deltas since we don't
+		// need to actually index into the subpartition.)
+		bli_prune_unstored_region_top_l( &diagoff, &m, &n, &offm_inc );
+
+		if ( !handle_edge_low )
+		{
+			// This branch handles the following two cases:
+			// - note: Edge case microtiles are marked as 'e'.
+			//
+			//   uplo_orig = lower         |   uplo = lower
+			//   handle edge high (orig)   |   handle edge high
+			//
+			//   x x x x x x x                x x x x x x x
+			//   x x x x x x x x              x x x x x x x x
+			//   x x x x x x x x x      ->    x x x x x x x x x
+			//   x x x x x x x x x x          x x x x x x x x x x
+			//   x x x x x x x x x x e        x x x x x x x x x x e
+			//   x x x x x x x x x x e        x x x x x x x x x x e
+			//
+			//   uplo_orig = upper         |   uplo = lower
+			//   handle edge low (orig)    |   handle edge high
+			//
+			//   e x x x x x x x x x x        x x x x x x x
+			//   e x x x x x x x x x x        x x x x x x x x
+			//     x x x x x x x x x x  ->    x x x x x x x x x
+			//       x x x x x x x x x        x x x x x x x x x x
+			//         x x x x x x x x        x x x x x x x x x x e
+			//           x x x x x x x        x x x x x x x x x x e
+
+			// If the edge case is being handled "high", then we can employ this
+			// simple macro for pruning the region to the right of where the
+			// diagonal intersects the right side of the submatrix (which amounts
+			// to adjusting the n dimension).
+			bli_prune_unstored_region_right_l( &diagoff, &m, &n, &offn_inc );
+		}
+		else // if ( handle_edge_low )
+		{
+			// This branch handles the following two cases:
+			//
+			//   uplo_orig = upper         |   uplo = lower
+			//   handle edge high (orig)   |   handle edge low
+			//
+			//   x x x x x x x x x x e        e x x x x x x
+			//   x x x x x x x x x x e        e x x x x x x x
+			//     x x x x x x x x x e  ->    e x x x x x x x x
+			//       x x x x x x x x e        e x x x x x x x x x
+			//         x x x x x x x e        e x x x x x x x x x x
+			//           x x x x x x e        e x x x x x x x x x x
+			//
+			//   uplo_orig = lower         |   uplo = lower
+			//   handle edge low (orig)    |   handle edge low
+			//
+			//   e x x x x x x                e x x x x x x
+			//   e x x x x x x x              e x x x x x x x
+			//   e x x x x x x x x      ->    e x x x x x x x x
+			//   e x x x x x x x x x          e x x x x x x x x x
+			//   e x x x x x x x x x x        e x x x x x x x x x x
+			//   e x x x x x x x x x x        e x x x x x x x x x x
+
+			// If the edge case is being handled "low", then we have to be more
+			// careful. The problem can be seen in certain situations when we're
+			// actually computing the weighted ranges for an upper-stored
+			// subpartition whose (a) diagonal offset is positive (though will
+			// always be less than NR), (b) right-side edge case exists, and (c)
+			// sum of (a) and (b) is less than NR. This is a problem because the
+			// upcoming loop that iterates over bli_thread_range_width_l()
+			// doesn't realize that the offsets associated with (a) and (b)
+			// belong on two separate columns of microtiles. If we naively use
+			// bli_prune_unstored_region_right_l() when handle_edge_low == TRUE,
+			// the loop over bli_thread_range_width_l() will only "see" p-1
+			// IR-iterations of work to assign to threads when there are
+			// actually p micropanels.
+
+			const dim_t n_inner = ( diagoff + bli_min( m, n - diagoff ) - bf_left );
+
+			const dim_t n_bf_iter_br = n_inner / bf;
+			const dim_t n_bf_left_br = n_inner % bf;
+			const dim_t n_bf_br = ( bf_left > 0 ?
1 : 0 ) + + n_bf_iter_br + + ( n_bf_left_br > 0 ? 1 : 0 ); + + // Compute the number of extra columns that were included in n_bf_br + // as a result of including a full micropanel for the part of the + // submatrix that contains bf_left columns. For example, if bf = 16 + // and bf_left = 4, then bf_extra = 12. But if bf_left = 0, then we + // didn't include any extra columns. + const dim_t bf_extra = ( bf_left > 0 ? bf - bf_left : 0 ); + + // Subtract off bf_extra from n_bf_br to arrive at the "true" value + // of n that we'll use going forward. + n = n_bf_br * bf - bf_extra; + + #if 0 + if ( n_way > 1 ) + { + //printf( "thread_range_weighted_sub(): tid %d: _iter _left = %3d %3d (lower1)\n", + // (int)my_id, (int)n_bf_iter_br, (int)n_bf_left_br ); + printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (lower2)\n", + (int)my_id, (int)(m), (int)(n), (int)(diagoff) ); + } + #endif + } + + // We don't need offm_inc, offn_inc here. These statements should + // prevent compiler warnings. + ( void )offm_inc; + ( void )offn_inc; + + // Now that pruning has taken place, we know that diagoff >= 0. + + // Compute the total area of the submatrix, accounting for the + // location of the diagonal. This is done by computing the area in + // the strictly upper triangle, subtracting it off the area of the + // full rectangle, and then adding the missing strictly upper + // triangles of the bf x bf blocks along the diagonal. + double tri_dim = ( double )( n - diagoff - 1 ); + tri_dim = bli_min( tri_dim, m - 1 ); + double utri_area = tri_dim * ( tri_dim + 1.0 ) / 2.0; + + // Note that the expression below is the simplified form of: + // blktri_area = ( tri_dim / bf ) * bf * ( bf - 1.0 ) / 2.0; + double blktri_area = tri_dim * ( bf - 1.0 ) / 2.0; + + // Compute the area of the region to the right of where the diagonal + // intersects the bottom edge of the submatrix. If it instead intersects + // the right edge (or the bottom-right corner), then this region does + // not exist and so its area is explicitly set to zero. + double beyondtri_dim = n - diagoff - m; + double beyondtri_area; + if ( 0 < beyondtri_dim ) beyondtri_area = beyondtri_dim * m; + else beyondtri_area = 0.0; + + // Here, we try to account for the added cost of computing columns of + // microtiles that intersect the diagonal. This is rather difficult to + // model, but this is partly due to the way non-square microtiles map + // onto the matrix relative to the diagonal, as well as additional + // overhead incurred from (potentially) computing with less-than-full + // columns of microtiles (i.e., columns for which diagoff_j < 0). + // Note that higher values for blktri_area have the net effect of + // increasing the relative size of slabs that share little or no overlap + // with the diagonal region. this is because it slightly increases the + // total area computation below, which in turn increases the area + // targeted by each thread/group earlier in the thread range, which + // for lower trapezoidal submatrices, corresponds to the regular + // rectangular region that precedes the diagonal part (if such a + // rectangular region exists). + blktri_area *= 1.5; + //blktri_area = 0.0; + + double area_total = ( double )m * ( double )n - utri_area + blktri_area + - beyondtri_area; + + // Divide the computed area by the number of ways of parallelism. 
+ double area_per_thr = area_total / ( double )n_way; + + + // Initialize some variables prior to the loop: the offset to the + // current subpartition, the remainder of the n dimension, and + // the diagonal offset of the current subpartition. + dim_t off_j = 0; + doff_t diagoff_j = diagoff; + dim_t n_left = n; + + #if 0 + printf( "thread_range_weighted_sub(): tid %d: n_left = %3d (lower4)\n", + (int)my_id, (int)(n_left) ); + #endif + + // Iterate over the subpartition indices corresponding to each + // thread/caucus participating in the n_way parallelism. + for ( dim_t j = 0; j < n_way; ++j ) + { + // Compute the width of the jth subpartition, taking the + // current diagonal offset into account, if needed. + dim_t width_j + = + bli_thread_range_width_l + ( + diagoff_j, m, n_left, + j, n_way, + bf, bf_left, + area_per_thr, + handle_edge_low + ); + + #if 0 + if ( n_way > 1 ) + printf( "thread_range_weighted_sub(): tid %d: width_j = %d doff_j = %d\n", + (int)my_id, (int)width_j, (int)diagoff_j ); + #endif + + // If the current thread belongs to caucus j, this is his + // subpartition. So we compute the implied index range and + // end our search. + #if 0 + // An alternate way of assigning work to threads such that regions + // are assigned to threads left to right *after* accounting for the + // fact that we recycle the same lower-trapezoidal code to also + // compute the upper-trapezoidal case. + bool is_my_range; + if ( bli_is_lower( uplo_orig ) ) is_my_range = ( j == my_id ); + else is_my_range = ( j == n_way - my_id - 1 ); + #else + bool is_my_range = ( j == my_id ); + #endif + + if ( is_my_range ) + { + *j_start_thr = off_j; + *j_end_thr = off_j + width_j; + + #if 0 + if ( n_way > 1 ) + printf( "thread_range_weighted_sub(): tid %d: sta end = %3d %3d\n", + (int)my_id, (int)(*j_start_thr), (int)(*j_end_thr) ); + //printf( "thread_range_weighted_sub(): tid %d: n_left = %3d\n", + // (int)my_id, (int)(n) ); + #endif + + // Compute the area of the thread's current subpartition in case + // the caller is curious how much work they were assigned. + // NOTE: This area computation isn't actually needed for BLIS to + // function properly.) + area = bli_find_area_trap_l( diagoff_j, m, width_j, bf ); + + break; + } + + // Shift the current subpartition's starting and diagonal offsets, + // as well as the remainder of the n dimension, according to the + // computed width, and then iterate to the next subpartition. + off_j += width_j; + diagoff_j -= width_j; + n_left -= width_j; + } + } + else // if ( bli_is_upper( uplo ) ) + { + // Express the upper-stored case in terms of the lower-stored case. + + #if 0 + if ( n_way > 1 ) + printf( "thread_range_weighted_sub(): tid %d: m n = %3d %3d do %d (upper)\n", + (int)my_id, (int)(m), (int)(n), (int)(diagoff) ); + #endif + + // First, we convert the upper-stored trapezoid to an equivalent + // lower-stored trapezoid by rotating it 180 degrees. + bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); + + // Now that the trapezoid is "flipped" in the n dimension, negate + // the bool that encodes whether to handle the edge case at the + // low (or high) end of the index range. + bli_toggle_bool( &handle_edge_low ); + + // Compute the appropriate range for the rotated trapezoid. 
+ area = bli_thread_range_weighted_sub + ( + thread, diagoff, uplo, uplo_orig, m, n, bf, + handle_edge_low, + j_start_thr, j_end_thr + ); + + // Reverse the indexing basis for the subpartition ranges so that + // the indices, relative to left-to-right iteration through the + // unrotated upper-stored trapezoid, map to the correct columns + // (relative to the diagonal). This amounts to subtracting the + // range from n. + bli_reverse_index_direction( n, j_start_thr, j_end_thr ); + } + + return area; +} + +// ----------------------------------------------------------------------------- + +siz_t bli_thread_range_mdim + ( + dir_t direct, + const thrinfo_t* thr, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl, + const cntx_t* cntx, + dim_t* start, + dim_t* end + ) +{ + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntl_family( cntl ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. + if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + const obj_t* x; + bool use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. + // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). + if ( family == BLIS_GEMM ) { x = a; use_weighted = FALSE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = a; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = a; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_range_weighted_t2b( thr, x, bmult, start, end ); + else + return bli_thread_range_weighted_b2t( thr, x, bmult, start, end ); + } + else + { + if ( direct == BLIS_FWD ) + return bli_thread_range_t2b( thr, x, bmult, start, end ); + else + return bli_thread_range_b2t( thr, x, bmult, start, end ); + } +} + +siz_t bli_thread_range_ndim + ( + dir_t direct, + const thrinfo_t* thr, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl, + const cntx_t* cntx, + dim_t* start, + dim_t* end + ) +{ + bszid_t bszid = bli_cntl_bszid( cntl ); + opid_t family = bli_cntl_family( cntl ); + + // This is part of trsm's current implementation, whereby right side + // cases are implemented in left-side micro-kernels, which requires + // we swap the usage of the register blocksizes for the purposes of + // packing A and B. + if ( family == BLIS_TRSM ) + { + if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR; + else bszid = BLIS_NR; + } + + const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + const obj_t* x; + bool use_weighted; + + // Use the operation family to choose the one of the two matrices + // being partitioned that potentially has structure, and also to + // decide whether or not we need to use weighted range partitioning. 
+ // NOTE: It's important that we use non-weighted range partitioning + // for hemm and symm (ie: the gemm family) because the weighted + // function will mistakenly skip over unstored regions of the + // structured matrix, even though they represent part of that matrix + // that will be dense and full (after packing). + if ( family == BLIS_GEMM ) { x = b; use_weighted = FALSE; } + else if ( family == BLIS_GEMMT ) { x = c; use_weighted = TRUE; } + else if ( family == BLIS_TRMM ) { x = b; use_weighted = TRUE; } + else /*family == BLIS_TRSM*/ { x = b; use_weighted = FALSE; } + + if ( use_weighted ) + { + if ( direct == BLIS_FWD ) + return bli_thread_range_weighted_l2r( thr, x, bmult, start, end ); + else + return bli_thread_range_weighted_r2l( thr, x, bmult, start, end ); + } + else + { + if ( direct == BLIS_FWD ) + return bli_thread_range_l2r( thr, x, bmult, start, end ); + else + return bli_thread_range_r2l( thr, x, bmult, start, end ); + } +} + +// ----------------------------------------------------------------------------- + +siz_t bli_thread_range_weighted_l2r + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + siz_t area; + + // This function assigns area-weighted ranges in the n dimension + // where the total range spans 0 to n-1 with 0 at the left end and + // n-1 at the right end. + + if ( bli_obj_intersects_diag( a ) && + bli_obj_is_upper_or_lower( a ) ) + { + num_t dt = bli_obj_dt( a ); + doff_t diagoff = bli_obj_diag_offset( a ); + uplo_t uplo = bli_obj_uplo( a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + // Support implicit transposition. + if ( bli_obj_has_trans( a ) ) + { + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + } + + area = + bli_thread_range_weighted_sub + ( + thr, diagoff, uplo, uplo, m, n, bf, + FALSE, start, end + ); + } + else // if dense or zeros + { + area = bli_thread_range_l2r + ( + thr, a, bmult, + start, end + ); + } + + return area; +} + +siz_t bli_thread_range_weighted_r2l + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + siz_t area; + + // This function assigns area-weighted ranges in the n dimension + // where the total range spans 0 to n-1 with 0 at the right end and + // n-1 at the left end. + + if ( bli_obj_intersects_diag( a ) && + bli_obj_is_upper_or_lower( a ) ) + { + num_t dt = bli_obj_dt( a ); + doff_t diagoff = bli_obj_diag_offset( a ); + uplo_t uplo = bli_obj_uplo( a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + // Support implicit transposition. + if ( bli_obj_has_trans( a ) ) + { + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + } + + bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); + + area = + bli_thread_range_weighted_sub + ( + thr, diagoff, uplo, uplo, m, n, bf, + TRUE, start, end + ); + } + else // if dense or zeros + { + area = bli_thread_range_r2l + ( + thr, a, bmult, + start, end + ); + } + + return area; +} + +siz_t bli_thread_range_weighted_t2b + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + siz_t area; + + // This function assigns area-weighted ranges in the m dimension + // where the total range spans 0 to m-1 with 0 at the top end and + // m-1 at the bottom end. 
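+	// (Editorial note: partitioning in the m dimension is implemented by
+	// reflecting the submatrix about its diagonal, which swaps the roles
+	// of m and n and toggles uplo, and then reusing the n-dimension
+	// partitioner, bli_thread_range_weighted_sub(), on the reflected view.)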
+ + if ( bli_obj_intersects_diag( a ) && + bli_obj_is_upper_or_lower( a ) ) + { + num_t dt = bli_obj_dt( a ); + doff_t diagoff = bli_obj_diag_offset( a ); + uplo_t uplo = bli_obj_uplo( a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + // Support implicit transposition. + if ( bli_obj_has_trans( a ) ) + { + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + } + + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + + area = + bli_thread_range_weighted_sub + ( + thr, diagoff, uplo, uplo, m, n, bf, + FALSE, start, end + ); + } + else // if dense or zeros + { + area = bli_thread_range_t2b + ( + thr, a, bmult, + start, end + ); + } + + return area; +} + +siz_t bli_thread_range_weighted_b2t + ( + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, + dim_t* start, + dim_t* end + ) +{ + siz_t area; + + // This function assigns area-weighted ranges in the m dimension + // where the total range spans 0 to m-1 with 0 at the bottom end and + // m-1 at the top end. + + if ( bli_obj_intersects_diag( a ) && + bli_obj_is_upper_or_lower( a ) ) + { + num_t dt = bli_obj_dt( a ); + doff_t diagoff = bli_obj_diag_offset( a ); + uplo_t uplo = bli_obj_uplo( a ); + dim_t m = bli_obj_length( a ); + dim_t n = bli_obj_width( a ); + dim_t bf = bli_blksz_get_def( dt, bmult ); + + // Support implicit transposition. + if ( bli_obj_has_trans( a ) ) + { + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + } + + bli_reflect_about_diag( &diagoff, &uplo, &m, &n ); + + bli_rotate180_trapezoid( &diagoff, &uplo, &m, &n ); + + area = bli_thread_range_weighted_sub + ( + thr, diagoff, uplo, uplo, m, n, bf, + TRUE, start, end + ); + } + else // if dense or zeros + { + area = bli_thread_range_b2t + ( + thr, a, bmult, + start, end + ); + } + + return area; +} + diff --git a/frame/thread/bli_thread_range.h b/frame/thread/bli_thread_range.h new file mode 100644 index 0000000000..cf966b5a35 --- /dev/null +++ b/frame/thread/bli_thread_range.h @@ -0,0 +1,128 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2016, Hewlett Packard Enterprise Development LP + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_THREAD_RANGE_H +#define BLIS_THREAD_RANGE_H + +// Thread range-related prototypes. + +BLIS_EXPORT_BLIS void bli_thread_range_sub + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end + ); + +#undef GENPROT +#define GENPROT( opname ) \ +\ +siz_t PASTEMAC0( opname ) \ + ( \ + dir_t direct, \ + const thrinfo_t* thr, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntl_t* cntl, \ + const cntx_t* cntx, \ + dim_t* start, \ + dim_t* end \ + ); + +GENPROT( thread_range_mdim ) +GENPROT( thread_range_ndim ) + +#undef GENPROT +#define GENPROT( opname ) \ +\ +siz_t PASTEMAC0( opname ) \ + ( \ + const thrinfo_t* thr, \ + const obj_t* a, \ + const blksz_t* bmult, \ + dim_t* start, \ + dim_t* end \ + ); + +GENPROT( thread_range_l2r ) +GENPROT( thread_range_r2l ) +GENPROT( thread_range_t2b ) +GENPROT( thread_range_b2t ) + +GENPROT( thread_range_weighted_l2r ) +GENPROT( thread_range_weighted_r2l ) +GENPROT( thread_range_weighted_t2b ) +GENPROT( thread_range_weighted_b2t ) + + +dim_t bli_thread_range_width_l + ( + doff_t diagoff_j, + dim_t m, + dim_t n_j, + dim_t j, + dim_t n_way, + dim_t bf, + dim_t bf_left, + double area_per_thr, + bool handle_edge_low + ); +siz_t bli_find_area_trap_l + ( + doff_t diagoff, + dim_t m, + dim_t n, + dim_t bf + ); + +siz_t bli_thread_range_weighted_sub + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + uplo_t uplo_orig, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr + ); + +#endif diff --git a/frame/thread/bli_thread_range_slab_rr.c b/frame/thread/bli_thread_range_slab_rr.c new file mode 100644 index 0000000000..be44323096 --- /dev/null +++ b/frame/thread/bli_thread_range_slab_rr.c @@ -0,0 +1,134 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_thread_range_quad + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + +#ifdef BLIS_ENABLE_JRIR_RR + + const dim_t tid = bli_thrinfo_work_id( thread ); + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 ); + + // Use round-robin (interleaved) partitioning of jr/ir loops. + *start = tid; + *end = n_iter; + *inc = jr_nt; + +#else // #elif defined( BLIS_ENABLE_JRIR_SLAB ) || + // defined( BLIS_ENABLE_JRIR_TLB ) + + // NOTE: While this cpp conditional branch applies to both _SLAB and _TLB + // cases, this *function* should never be called when BLIS_ENABLE_JRIR_TLB + // is defined, since the function is only called from macrokernels that were + // designed for slab/rr partitioning. + + const dim_t jr_nt = bli_thrinfo_n_way( thread ); + const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 ); + + // If there is no parallelism in this loop, set the output variables + // and return early. + if ( jr_nt == 1 ) { *start = 0; *end = n_iter; *inc = 1; return; } + + // Local variables for the computed start, end, and increment. + dim_t st, en, in; + + if ( bli_intersects_diag_n( diagoff, m, n ) ) + { + // If the current submatrix intersects the diagonal, try to be + // intelligent about how threads are assigned work by using the + // quadratic partitioning function. + + bli_thread_range_weighted_sub + ( + thread, diagoff, uplo, uplo, m, n, bf, + handle_edge_low, &st, &en + ); + in = bf; + } + else + { + // If the current submatrix does not intersect the diagonal, then we + // are free to perform a uniform (and contiguous) slab partitioning. + + bli_thread_range_sub + ( + thread, n, bf, + handle_edge_low, &st, &en + ); + in = bf; + } + + // Convert the start and end column indices into micropanel indices by + // dividing by the blocking factor (which, for the jr loop, is NR). If + // either one yields a remainder, add an extra unit to the result. This + // is necessary for situations where there are t threads with t-1 or + // fewer micropanels of work, including an edge case. For example, if + // t = 3 and n = 10 (with bf = NR = 8), then we want start and end for + // each thread to be: + // + // column index upanel index + // tid 0: start, end = 0, 8 -> start, end = 0, 1 + // tid 1: start, end = 8, 10 -> start, end = 1, 2 + // tid 2: start, end = 10, 10 -> start, end = 2, 2 + // + // In this example, it's important that thread (tid) 2 gets no work, and + // we express that by specifying start = end = n, which is a non-existent + // column index. 
+ + if ( st % bf == 0 ) *start = st / bf; + else *start = st / bf + 1; + + if ( en % bf == 0 ) *end = en / bf; + else *end = en / bf + 1; + + *inc = in / bf; + +#endif +} diff --git a/frame/thread/bli_thread_range_slab_rr.h b/frame/thread/bli_thread_range_slab_rr.h new file mode 100644 index 0000000000..3e9797363b --- /dev/null +++ b/frame/thread/bli_thread_range_slab_rr.h @@ -0,0 +1,116 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_THREAD_RANGE_SLAB_RR_H +#define BLIS_THREAD_RANGE_SLAB_RR_H + +BLIS_INLINE void bli_thread_range_rr + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + const dim_t tid = bli_thrinfo_work_id( thread ); + const dim_t nt = bli_thrinfo_n_way( thread ); + const dim_t n_iter = n / bf + ( n % bf ? 1 : 0 ); + + // Use round-robin (interleaved) partitioning of jr/ir loops. + *start = tid; + *end = n_iter; + *inc = nt; +} + +BLIS_INLINE void bli_thread_range_sl + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use contiguous slab partitioning of jr/ir loops. + bli_thread_range_sub( thread, n, bf, handle_edge_low, start, end ); + *inc = 1; +} + +BLIS_INLINE void bli_thread_range_slrr + ( + const thrinfo_t* thread, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Define a general-purpose slab/rr function whose definition depends on + // whether slab or round-robin partitioning was requested at configure-time. + // Note that this function also uses the slab code path when tlb is enabled. + // If this is ever changed, make sure to change bli_is_my_iter() since they + // are used together by packm. 
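+	// For example (illustrative values only): with n = 12, bf = 1, and
+	// four threads, bli_thread_range_rr() would give thread 1 the
+	// interleaved iterations 1, 5, 9 (start = 1, end = 12, inc = 4),
+	// whereas bli_thread_range_sl() would give it one contiguous block
+	// of three iterations (start = 3, end = 6, inc = 1).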
+ +#ifdef BLIS_ENABLE_JRIR_RR + bli_thread_range_rr( thread, n, bf, handle_edge_low, start, end, inc ); +#else // ifdef ( _SLAB || _TLB ) + bli_thread_range_sl( thread, n, bf, handle_edge_low, start, end, inc ); +#endif +} + +// ----------------------------------------------------------------------------- + +void bli_thread_range_quad + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ); + +#endif + diff --git a/frame/thread/bli_thread_range_tlb.c b/frame/thread/bli_thread_range_tlb.c new file mode 100644 index 0000000000..546ed341d6 --- /dev/null +++ b/frame/thread/bli_thread_range_tlb.c @@ -0,0 +1,1699 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +// ----------------------------------------------------------------------------- + +#define PRINT_MODE +#define PGUARD if ( 0 ) +//#define PRINT_RESULT + + +#if 0 +dim_t bli_thread_range_tlb + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + dim_t n_ut_for_me; + + if ( bli_is_lower( uplo ) ) + { + n_ut_for_me = bli_thread_range_tlb_l + ( + nt, tid, diagoff, m_iter, n_iter, mr, nr, j_st_p, i_st_p + ); + } + else if ( bli_is_upper( uplo ) ) + { + n_ut_for_me = bli_thread_range_tlb_u + ( + nt, tid, diagoff, m_iter, n_iter, mr, nr, j_st_p, i_st_p + ); + } + else // if ( bli_is_dense( uplo ) ) + { + n_ut_for_me = bli_thread_range_tlb_d + ( + nt, tid, m_iter, n_iter, mr, nr, j_st_p, i_st_p + ); + } + + return n_ut_for_me; +} +#endif + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_tlb_l + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + // This function implements tile-level load balancing for a + // lower-trapezoidal submatrix. This partitioning guarantees that all + // threads are assigned nearly the same number of microtiles-worth of work, + // with a maximum imbalance of one microtile. It makes no effort, however, + // to account for differences in threads' workload that is attributable to + // differences in the number of edge-case (or diagonal-intersecting) + // microtiles (which incur slightly more work since they must first write + // to a temporary microtile before updating the output C matrix). + + // Assumption: -mr < diagoff. Make sure to prune leading rows beforehand! + if ( diagoff <= -mr ) bli_abort(); + + // + // -- Step 1: Compute the computational area of the region ----------------- + // + + // Compute the m and n dimensions according to m_iter and n_iter. (These + // m and n dims will likely be larger than the actual m and n since they + // "round up" the edge case microtiles into full-sized microtiles.) + const dim_t m = m_iter * mr; + const dim_t n = n_iter * nr; + + // For the purposes of many computations in this function, we aren't + // interested in the extent to which diagoff exceeds n (if it does) + // So we use a new variable that is guaranteed to be no greater than n. + const doff_t diagoffmin = bli_min( diagoff, n ); + + const dim_t m_rect = m; + const dim_t n_rect = ( diagoffmin / nr ) * nr; + + const dim_t rect_area = m_rect * n_rect; + const dim_t nonrect_area = m * n - rect_area; + + //const dim_t offn_rect = 0; + const dim_t offn_nonrect = n_rect; + const dim_t diagoff_nonrect = diagoffmin - n_rect; //diagoff % nr; + + const dim_t n_nonrect = n - n_rect; + + const dim_t offn_ut_nonrect = ( diagoffmin / nr ); + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "min(diagoff,n): %7ld\n", diagoffmin ); + PGUARD printf( "offn_ut_nonrect: %7ld\n", offn_ut_nonrect ); + PGUARD printf( "offn_nonrect: %7ld\n", offn_nonrect ); + PGUARD printf( "diagoff_nonrect: %7ld\n", diagoff_nonrect ); + PGUARD printf( "n_nonrect: %7ld\n", n_nonrect ); + PGUARD printf( "---------------------------\n" ); + + dim_t num_unref_ut = 0; + + // Count the number of unreferenced utiles strictly above the diagonal. 
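+	// For example (hypothetical values): with mr = nr = 8, m_iter = 3,
+	// n_iter = 4, and diagoff = 0, we have n_rect = 0, and the loop below
+	// visits j = 0, 8, 16, 24 with diagoff_j = 0, -8, -16, -24, counting
+	// 0 + 1 + 2 + 3 = 6 unreferenced utiles. This leaves n_ut_ref =
+	// 3*4 - 6 = 6 utiles of referenced work to divide among the threads.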
+ for ( dim_t j = 0; j < n_nonrect; j += nr ) + { + const dim_t diagoff_j = diagoff_nonrect - j; + + // diagoff_j will always be at most nr - 1, but will typically be + // negative. This is because the non-rectangular region's diagonal + // offset will be at most nr - 1 for the first column of microtiles, + // since if it were more than nr - 1, that column would have already + // been pruned away (via the implicit pruning of diagoff_nonrect). + // NOTE: We use bli_max() to ensure that -diagoff_j / mr does not + // become negative, which can only happen if "top" pruning is not + // performed beforehand (and so it really isn't necessary here). + const dim_t num_unref_ut_j = bli_max( ( -diagoff_j / mr ), 0 ); + + num_unref_ut += num_unref_ut_j; + + PGUARD printf( "j %7ld\n", j ); + PGUARD printf( "diagoff_j %7ld\n", diagoff_j ); + PGUARD printf( "num_unref_ut_j %7ld\n", num_unref_ut_j ); + PGUARD printf( "num_unref_ut %7ld\n", num_unref_ut ); + PGUARD printf( "\n" ); + } + PGUARD printf( "---------------------------\n" ); + + const dim_t tri_unref_area = num_unref_ut * mr * nr; + const dim_t tri_ref_area = nonrect_area - tri_unref_area; + const dim_t total_ref_area = rect_area + tri_ref_area; + + PGUARD printf( "gross area: %7ld\n", m * n ); + PGUARD printf( "rect_area: %7ld\n", rect_area ); + PGUARD printf( "nonrect_area: %7ld\n", nonrect_area ); + PGUARD printf( "tri_unref_area: %7ld\n", tri_unref_area ); + PGUARD printf( "tri_ref_area: %7ld\n", tri_ref_area ); + PGUARD printf( "total_ref_area: %7ld\n", total_ref_area ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 2: Compute key utile counts (per thread, per column, etc.) ------ + // + + const dim_t n_ut_ref = total_ref_area / ( mr * nr ); + //const dim_t n_ut_tri_ref = tri_ref_area / ( mr * nr ); + const dim_t n_ut_rect = rect_area / ( mr * nr ); + + PGUARD printf( "n_ut_ref: %7ld\n", n_ut_ref ); + //PGUARD printf( "n_ut_tri_ref: %7ld\n", n_ut_tri_ref ); + PGUARD printf( "n_ut_rect: %7ld\n", n_ut_rect ); + PGUARD printf( "---------------------------\n" ); + + // Compute the number of microtiles to allocate per thread as well as the + // number of leftover microtiles. + const dim_t n_ut_per_thr = n_ut_ref / nt; + const dim_t n_ut_pt_left = n_ut_ref % nt; + + PGUARD printf( "n_ut_per_thr: %7ld\n", n_ut_per_thr ); + PGUARD printf( "n_ut_pt_left: %7ld\n", n_ut_pt_left ); + PGUARD printf( "---------------------------\n" ); + + const dim_t n_ut_per_col = m_iter; + + PGUARD printf( "n_ut_per_col: %7ld\n", n_ut_per_col ); + + // Allocate one of the leftover microtiles to the current thread if its + // tid is one of the lower thread ids. + const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 ); + + PGUARD printf( "n_ut_for_me: %7ld (%ld+%ld)\n", n_ut_for_me, + n_ut_per_thr, n_ut_for_me - n_ut_per_thr ); + + // Compute the number of utiles prior to the current thread's starting + // point. This is the sum of all n_ut_for_me for all thread ids less + // than tid. Notice that the second half of this expression effectively + // adds one extra microtile for each lower-valued thread id, up to + // n_ut_pt_left. 
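+	// Continuing the hypothetical example from above (n_ut_ref = 6): with
+	// nt = 4 we get n_ut_per_thr = 1 and n_ut_pt_left = 2, so threads 0
+	// and 1 are each assigned two utiles while threads 2 and 3 each get
+	// one. For tid = 2 this yields n_ut_before = 2*1 + min( 2, 2 ) = 4.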
+ const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left ); + + PGUARD printf( "n_ut_before: %7ld\n", n_ut_before ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid -------- + // + + dim_t j_st; + dim_t i_st; + + if ( n_ut_before < n_ut_rect ) + { + // This branch handles scenarios where the number of microtiles + // assigned to lower thread ids is strictly less than the number of + // utiles in the rectangular region. This means that calculating the + // starting microtile index is easy (because it does not need to + // take the location of the diagonal into account). + + PGUARD printf( "Rectangular region: n_ut_before < n_ut_rect\n" ); + PGUARD printf( "\n" ); + + const dim_t ut_index_rect_st = n_ut_before; + + PGUARD printf( "ut_index_st: %7ld\n", ut_index_rect_st ); + PGUARD printf( "---------------------------\n" ); + + j_st = ut_index_rect_st / n_ut_per_col; + i_st = ut_index_rect_st % n_ut_per_col; + + PGUARD printf( "j_st, i_st (fnl=) %4ld,%4ld\n", j_st, i_st ); + } + else // if ( n_ut_rect <= n_ut_before ) + { + // This branch handles scenarios where the number of microtiles + // assigned to lower thread ids exceeds (or equals) the number of + // utiles in the rectangular region. This means we need to observe the + // location of the diagonal to see how many utiles are referenced per + // column of utiles. + + PGUARD printf( "Diagonal region: n_ut_rect <= n_ut_before\n" ); + PGUARD printf( "\n" ); + + // This will be the number of microtile columns we will immediately + // advance past to get to the diagonal region. + const dim_t n_ut_col_adv = offn_ut_nonrect; + + PGUARD printf( "n_ut_col_adv: %7ld\n", n_ut_col_adv ); + + // In order to find j_st and i_st, we need to "allocate" n_ut_before + // microtiles. + dim_t n_ut_tba = n_ut_before; + + PGUARD printf( "n_ut_tba: %7ld\n", n_ut_tba ); + + // Advance past the rectangular region, decrementing n_ut_tba + // accordingly. + n_ut_tba -= n_ut_per_col * n_ut_col_adv; + + PGUARD printf( "n_ut_tba_1: %7ld\n", n_ut_tba ); + PGUARD printf( "\n" ); + + // In case n_ut_tba == 0. Only happens when n_ut_before == n_ut_rect. + j_st = n_ut_col_adv; + i_st = 0; + + for ( dim_t j = n_ut_col_adv; 0 < n_ut_tba; ++j ) + { + const dim_t diagoff_j = diagoffmin - j*nr; + const dim_t n_ut_skip_j = bli_max( -diagoff_j / mr, 0 ); + const dim_t n_ut_this_col = n_ut_per_col - n_ut_skip_j; + + PGUARD printf( "j: %7ld\n", j ); + PGUARD printf( "diagoff_j: %7ld\n", diagoff_j ); + PGUARD printf( "n_ut_skip_j: %7ld\n", n_ut_skip_j ); + PGUARD printf( "n_ut_this_col: %7ld\n", n_ut_this_col ); + PGUARD printf( "n_ut_tba_j0: %7ld\n", n_ut_tba ); + + if ( n_ut_tba < n_ut_this_col ) + { + // If the number of utiles to allocate is less than the number + // in this column, we know that j_st will refer to the current + // column. To find i_st, we first skip to the utile that + // intersects the diagonal and then add n_ut_tba. + j_st = j; + i_st = n_ut_skip_j + n_ut_tba; + PGUARD printf( "j_st, i_st (fnl<) %4ld,%4ld\n", j_st, i_st ); + } + else if ( n_ut_tba == n_ut_this_col ) + { + // If the number of utiles to allocate is exactly equal to the + // number in this column, we know that j_st will refer to the + // *next* column. But to find i_st, we will have to take the + // location of the diagonal into account. 
+ const doff_t diagoff_jp1 = diagoff_j - nr; + const dim_t n_ut_skip_jp1 = bli_max( -diagoff_jp1 / mr, 0 ); + + j_st = j + 1; + i_st = n_ut_skip_jp1; + PGUARD printf( "j_st, i_st (fnl=) %4ld,%4ld\n", j_st, i_st ); + } + + // No matter what (especially if the number of utiles to allocate + // exceeds the number in this column), we decrement n_ut_tba attempt + // to continue to the next iteration. (Note: If either of the two + // branches above is triggered, n_ut_tba will be decremented down to + // zero (or less), in which case this will be the final iteration.) + n_ut_tba -= n_ut_this_col; + + PGUARD printf( "n_ut_tba_j1: %7ld\n", n_ut_tba ); + PGUARD printf( "\n" ); + } + } + + // + // -- Step 4: Save the results --------------------------------------------- + // + + *j_st_p = j_st; + *i_st_p = i_st; + + #ifdef PRINT_RESULT + printf( "j_st, i_st (mem) %4ld,%4ld (n_ut: %4ld)\n", + j_st, i_st, n_ut_for_me ); + #endif + + // Return the number of utiles that this thread was allocated. + return n_ut_for_me; +} + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_tlb_u + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + // This function implements tile-level load balancing for an + // upper-trapezoidal submatrix. This partitioning guarantees that all + // threads are assigned nearly the same number of microtiles-worth of work, + // with a maximum imbalance of one microtile. It makes no effort, however, + // to account for differences in threads' workload that is attributable to + // differences in the number of edge-case (or diagonal-intersecting) + // microtiles (which incur slightly more work since they must first write + // to a temporary microtile before updating the output C matrix). + + // Assumption: diagoff < nr. Make sure to prune leading columns beforehand! + if ( nr <= diagoff ) bli_abort(); + + // + // -- Step 1: Compute the computational area of the region ----------------- + // + + // Compute the m and n dimensions according to m_iter and n_iter. (These + // m and n dims will likely be larger than the actual m and n since they + // "round up" the edge case microtiles into full-sized microtiles.) + const dim_t m = m_iter * mr; + const dim_t n = n_iter * nr; + + // For the purposes of many computations in this function, we aren't + // interested in the extent to which diagoff exceeds -m (if it does) + // So we use a new variable that is guaranteed to be no less than -m. + const doff_t diagoffmin = bli_max( diagoff, -m ); + + const dim_t m_rect = m; + const dim_t n_rect = ( -diagoffmin / nr ) * nr; + + const dim_t rect_area = m_rect * n_rect; + const dim_t nonrect_area = m * n - rect_area; + + const dim_t offn_rect = n - n_rect; + //const dim_t offn_nonrect = 0; + const dim_t diagoff_nonrect = diagoffmin; + + const dim_t n_nonrect = n - n_rect; + + const dim_t offn_ut_rect = n_iter + ( diagoffmin / nr ); + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "max(diagoff,-m): %7ld\n", diagoffmin ); + PGUARD printf( "offn_ut_rect: %7ld\n", offn_ut_rect ); + PGUARD printf( "offn_rect: %7ld\n", offn_rect ); + PGUARD printf( "diagoff_nonrect: %7ld\n", diagoff_nonrect ); + PGUARD printf( "n_nonrect: %7ld\n", n_nonrect ); + PGUARD printf( "---------------------------\n" ); + + dim_t num_unref_ut = 0; + + // Count the number of unreferenced utiles strictly below the diagonal. 
+ for ( dim_t j = 0; j < n_nonrect; j += nr ) + { + const dim_t diagoff_j = diagoff_nonrect - j; + + // diagoff_j will always be at most nr - 1, but will typically be + // negative. This is because the non-rectangular region's diagonal + // offset will be at most nr - 1 for the first column of microtiles, + // since if it were more than nr - 1, that column would have already + // been pruned away (prior to this function being called). + // NOTE: We use bli_max() to ensure that ( m + diagoff_j - nr ) / mr + // does not become negative, which can happen in some situations + // during the first iteration if diagoff is relatively close to -m. + // NOTE: We subtract nr from diagoff_j since it's really the diagonal + // offset of the *next* column of utiles that needs to be used to + // determine how many utiles are referenced in the current column. + const dim_t num_unref_ut_j = bli_max( ( m + diagoff_j - nr ) / mr, 0 ); + + num_unref_ut += num_unref_ut_j; + + PGUARD printf( "j %7ld\n", j ); + PGUARD printf( "diagoff_j - nr %7ld\n", diagoff_j - nr ); + PGUARD printf( "num_unref_ut_j %7ld\n", num_unref_ut_j ); + PGUARD printf( "num_unref_ut %7ld\n", num_unref_ut ); + PGUARD printf( "\n" ); + } + PGUARD printf( "---------------------------\n" ); + + const dim_t tri_unref_area = num_unref_ut * mr * nr; + const dim_t tri_ref_area = nonrect_area - tri_unref_area; + const dim_t total_ref_area = rect_area + tri_ref_area; + + PGUARD printf( "gross area: %7ld\n", m * n ); + PGUARD printf( "rect_area: %7ld\n", rect_area ); + PGUARD printf( "nonrect_area: %7ld\n", nonrect_area ); + PGUARD printf( "tri_unref_area: %7ld\n", tri_unref_area ); + PGUARD printf( "tri_ref_area: %7ld\n", tri_ref_area ); + PGUARD printf( "total_ref_area: %7ld\n", total_ref_area ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 2: Compute key utile counts (per thread, per column, etc.) ------ + // + + const dim_t n_ut_ref = total_ref_area / ( mr * nr ); + const dim_t n_ut_tri_ref = tri_ref_area / ( mr * nr ); + //const dim_t n_ut_rect = rect_area / ( mr * nr ); + + PGUARD printf( "n_ut_ref: %7ld\n", n_ut_ref ); + PGUARD printf( "n_ut_tri_ref: %7ld\n", n_ut_tri_ref ); + //PGUARD printf( "n_ut_rect: %7ld\n", n_ut_rect ); + PGUARD printf( "---------------------------\n" ); + + // Compute the number of microtiles to allocate per thread as well as the + // number of leftover microtiles. + const dim_t n_ut_per_thr = n_ut_ref / nt; + const dim_t n_ut_pt_left = n_ut_ref % nt; + + PGUARD printf( "n_ut_per_thr: %7ld\n", n_ut_per_thr ); + PGUARD printf( "n_ut_pt_left: %7ld\n", n_ut_pt_left ); + PGUARD printf( "---------------------------\n" ); + + const dim_t n_ut_per_col = m_iter; + + PGUARD printf( "n_ut_per_col: %7ld\n", n_ut_per_col ); + + // Allocate one of the leftover microtiles to the current thread if its + // tid is one of the lower thread ids. + const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 ); + + PGUARD printf( "n_ut_for_me: %7ld (%ld+%ld)\n", n_ut_for_me, + n_ut_per_thr, n_ut_for_me - n_ut_per_thr ); + + // Compute the number of utiles prior to the current thread's starting + // point. This is the sum of all n_ut_for_me for all thread ids less + // than tid. Notice that the second half of this expression effectively + // adds one extra microtile for each lower-valued thread id, up to + // n_ut_pt_left. 
+ const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left ); + + PGUARD printf( "n_ut_before: %7ld\n", n_ut_before ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid -------- + // + + dim_t j_st; + dim_t i_st; + + if ( n_ut_tri_ref <= n_ut_before ) + { + // This branch handles scenarios where the number of microtiles + // assigned to lower thread ids exceeds (or equals) the number of + // utiles in the diagonal region. This means that calculating the + // starting microtile index is easy (because it does not need to + // take the location of the diagonal into account). + + PGUARD printf( "Rectangular region: n_ut_tri_ref <= n_ut_before\n" ); + PGUARD printf( "\n" ); + + const dim_t ut_index_rect_st = n_ut_before - n_ut_tri_ref; + + PGUARD printf( "ut_index_rect_st: %7ld\n", ut_index_rect_st ); + PGUARD printf( "---------------------------\n" ); + + j_st = offn_ut_rect + ut_index_rect_st / n_ut_per_col; + i_st = ut_index_rect_st % n_ut_per_col; + + PGUARD printf( "j_st, i_st (fnl=) %4ld,%4ld\n", j_st, i_st ); + } + else // if ( n_ut_before < n_ut_tri_ref ) + { + // This branch handles scenarios where the number of microtiles + // assigned to lower thread ids is strictly less than the number of + // utiles in the diagonal region. This means we need to observe the + // location of the diagonal to see how many utiles are referenced per + // column of utiles. + + PGUARD printf( "Diagonal region: n_ut_before < n_ut_tri_ref\n" ); + PGUARD printf( "\n" ); + + // This will be the number of microtile columns we will immediately + // advance past to get to the diagonal region. + const dim_t n_ut_col_adv = 0; + + PGUARD printf( "n_ut_col_adv: %7ld\n", n_ut_col_adv ); + + // In order to find j_st and i_st, we need to "allocate" n_ut_before + // microtiles. + dim_t n_ut_tba = n_ut_before; + + PGUARD printf( "n_ut_tba: %7ld\n", n_ut_tba ); + + // No need to advance since the upper-trapezoid begins with the + // diagonal region. + //n_ut_tba -= 0; + + PGUARD printf( "n_ut_tba_1: %7ld\n", n_ut_tba ); + PGUARD printf( "\n" ); + + // In case n_ut_tba == 0. Only happens when n_ut_before == 0. + j_st = 0; + i_st = 0; + + for ( dim_t j = n_ut_col_adv; 0 < n_ut_tba; ++j ) + { + const dim_t diagoff_j = diagoffmin - j*nr; + const dim_t n_ut_skip_j = bli_max( ( m + diagoff_j - nr ) / mr, 0 ); + const dim_t n_ut_this_col = n_ut_per_col - n_ut_skip_j; + + PGUARD printf( "j: %7ld\n", j ); + PGUARD printf( "diagoff_j: %7ld\n", diagoff_j ); + PGUARD printf( "n_ut_skip_j: %7ld\n", n_ut_skip_j ); + PGUARD printf( "n_ut_this_col: %7ld\n", n_ut_this_col ); + PGUARD printf( "n_ut_tba_j0: %7ld\n", n_ut_tba ); + + if ( n_ut_tba < n_ut_this_col ) + { + // If the number of utiles to allocate is less than the number + // in this column, we know that j_st will refer to the current + // column. To find i_st, we simply use n_ut_tba. + j_st = j; + i_st = n_ut_tba; + PGUARD printf( "j_st, i_st (fnl<) %4ld,%4ld\n", j_st, i_st ); + } + else if ( n_ut_tba == n_ut_this_col ) + { + // If the number of utiles to allocate is exactly equal to the + // number in this column, we know that j_st will refer to the + // *next* column. In this situation, i_st will always be 0. + + j_st = j + 1; + i_st = 0; + PGUARD printf( "j_st, i_st (fnl=) %4ld,%4ld\n", j_st, i_st ); + } + + // No matter what (especially if the number of utiles to allocate + // exceeds the number in this column), we decrement n_ut_tba attempt + // to continue to the next iteration. 
(Note: If either of the two + // branches above is triggered, n_ut_tba will be decremented down to + // zero (or less), in which case this will be the final iteration.) + n_ut_tba -= n_ut_this_col; + + PGUARD printf( "n_ut_tba_j1: %7ld\n", n_ut_tba ); + PGUARD printf( "\n" ); + } + } + + // + // -- Step 4: Save the results --------------------------------------------- + // + + *j_st_p = j_st; + *i_st_p = i_st; + + #ifdef PRINT_RESULT + printf( "j_st, i_st (mem) %4ld,%4ld (n_ut: %4ld)\n", + j_st, i_st, n_ut_for_me ); + #endif + + // Return the number of utiles that this thread was allocated. + return n_ut_for_me; +} + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_tlb_d + ( + const dim_t nt, + const dim_t tid, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + // This function implements tile-level load balancing for a + // general/dense submatrix. This partitioning guarantees that all + // threads are assigned nearly the same number of microtiles-worth of work, + // with a maximum imbalance of one microtile. It makes no effort, however, + // to account for differences in threads' workload that is attributable to + // differences in the number of edge-case microtiles (which incur slightly + // more work since they must first write to a temporary microtile before + // updating the output C matrix). + + // + // -- Step 1: Compute the computational area of the region ----------------- + // + + // Compute the m and n dimensions according to m_iter and n_iter. (These + // m and n dims will likely be larger than the actual m and n since they + // "round up" the edge case microtiles into full-sized microtiles.) + const dim_t m = m_iter * mr; + const dim_t n = n_iter * nr; + + const dim_t m_rect = m; + const dim_t n_rect = n; + + const dim_t total_ref_area = m_rect * n_rect; + + PGUARD printf( "total_ref_area: %7ld\n", total_ref_area ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 2: Compute key utile counts (per thread, per column, etc.) ------ + // + + const dim_t n_ut_ref = total_ref_area / ( mr * nr ); + + PGUARD printf( "n_ut_ref: %7ld\n", n_ut_ref ); + PGUARD printf( "---------------------------\n" ); + + // Compute the number of microtiles to allocate per thread as well as the + // number of leftover microtiles. + const dim_t n_ut_per_thr = n_ut_ref / nt; + const dim_t n_ut_pt_left = n_ut_ref % nt; + + PGUARD printf( "n_ut_per_thr: %7ld\n", n_ut_per_thr ); + PGUARD printf( "n_ut_pt_left: %7ld\n", n_ut_pt_left ); + PGUARD printf( "---------------------------\n" ); + + const dim_t n_ut_per_col = m_iter; + + PGUARD printf( "n_ut_per_col: %7ld\n", n_ut_per_col ); + + // Allocate one of the leftover microtiles to the current thread if its + // tid is one of the lower thread ids. + const dim_t n_ut_for_me = n_ut_per_thr + ( tid < n_ut_pt_left ? 1 : 0 ); + + PGUARD printf( "n_ut_for_me: %7ld (%ld+%ld)\n", n_ut_for_me, + n_ut_per_thr, n_ut_for_me - n_ut_per_thr ); + + // Compute the number of utiles prior to the current thread's starting + // point. This is the sum of all n_ut_for_me for all thread ids less + // than tid. Notice that the second half of this expression effectively + // adds one extra microtile for each lower-valued thread id, up to + // n_ut_pt_left. 
+ const dim_t n_ut_before = tid * n_ut_per_thr + bli_min( tid, n_ut_pt_left ); + + PGUARD printf( "n_ut_before: %7ld\n", n_ut_before ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid -------- + // + + const dim_t ut_index_st = n_ut_before; + + PGUARD printf( "ut_index_st: %7ld\n", ut_index_st ); + PGUARD printf( "---------------------------\n" ); + + const dim_t j_st = ut_index_st / n_ut_per_col; + const dim_t i_st = ut_index_st % n_ut_per_col; + + // + // -- Step 4: Save the results --------------------------------------------- + // + + *j_st_p = j_st; + *i_st_p = i_st; + + #ifdef PRINT_RESULT + printf( "j_st, i_st (mem) %4ld,%4ld (n_ut: %4ld)\n", + j_st, i_st, n_ut_for_me ); + #endif + + // Return the number of utiles that this thread was allocated. + return n_ut_for_me; +} + +// ----------------------------------------------------------------------------- + +BLIS_INLINE dim_t bli_tlb_trmm_lx_k_iter + ( + const doff_t diagoff_iter, + const uplo_t uplo, + const dim_t k_iter, + const dim_t ir_iter + ) +{ + if ( bli_is_lower( uplo ) ) + return bli_min( diagoff_iter + ( ir_iter + 1 ), k_iter ); + else // if ( bli_is_upper( uplo ) ) + return k_iter - bli_max( diagoff_iter + ir_iter, 0 ); +} + +BLIS_INLINE dim_t bli_tlb_trmm_rl_k_iter + ( + const doff_t diagoff_iter, + const dim_t k_iter, + const dim_t jr_iter + ) +{ + return k_iter - bli_max( -diagoff_iter + jr_iter, 0 ); +} + +// ----------------------------------------------------------------------------- + +dim_t bli_thread_range_tlb_trmm_ll + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + return bli_thread_range_tlb_trmm_lx_impl + ( + nt, tid, diagoff, BLIS_LOWER, m_iter, n_iter, k_iter, mr, nr, + j_st_p, i_st_p + ); +} + +dim_t bli_thread_range_tlb_trmm_lu + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + return bli_thread_range_tlb_trmm_lx_impl + ( + nt, tid, diagoff, BLIS_UPPER, m_iter, n_iter, k_iter, mr, nr, + j_st_p, i_st_p + ); +} + +dim_t bli_thread_range_tlb_trmm_lx_impl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + // Assumption: 0 <= diagoff (lower); diagoff <= 0 (upper). + // Make sure to prune leading rows (lower) or columns (upper) beforehand! + if ( bli_is_lower( uplo ) && diagoff < 0 ) bli_abort(); + else if ( bli_is_upper( uplo ) && diagoff > 0 ) bli_abort(); + + // Single-threaded cases are simple and allow early returns. + if ( nt == 1 ) + { + const dim_t n_ut_for_me = m_iter * n_iter; + + *j_st_p = 0; + *i_st_p = 0; + + return n_ut_for_me; + } + + // + // -- Step 1: Compute the computational flop cost of each utile column ----- + // + + // Normalize the diagonal offset by mr so that it represents the offset in + // units of mr x mr chunks. + const doff_t diagoff_iter = diagoff / mr; + + // Determine the actual k dimension, in units of mr x mr iterations, capped + // by the k_iter given by the caller. 
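+    // (Concretely, the helper bli_tlb_trmm_lx_k_iter(), defined above, returns
+    // min( diagoff_iter + i + 1, k_iter ) micropanel iterations in the
+    // lower-triangular case and k_iter - max( diagoff_iter + i, 0 ) in the
+    // upper-triangular case, where i is the micropanel's row index within the
+    // column.)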
+
+    PGUARD printf( "---------------------------\n" );
+    PGUARD printf( "m_iter: %7ld\n", m_iter );
+    PGUARD printf( "n_iter: %7ld\n", n_iter );
+    PGUARD printf( "k_iter: %7ld\n", k_iter );
+    PGUARD printf( "mr: %7ld\n", mr );
+    PGUARD printf( "nr: %7ld\n", nr );
+    PGUARD printf( "diagoff_iter: %7ld\n", diagoff_iter );
+
+    dim_t uops_per_col = 0;
+
+    // Compute the computational flop cost of each microtile column, normalized
+    // by the number of flops performed by each mr x nr rank-1 update. This
+    // is simply the sum of all of the k dimensions of each micropanel, up to
+    // and including (lower) or starting from (upper) the part that intersects
+    // the diagonal, or the right (lower) or left (upper) edge of the matrix,
+    // as applicable.
+    for ( dim_t i = 0; i < m_iter; ++i )
+    {
+        // Don't allow k_i_iter to exceed k_iter, which is the maximum possible
+        // k dimension (in units of mr x mr chunks of micropanel).
+        const dim_t k_i_iter
+        = bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i );
+
+        uops_per_col += k_i_iter;
+    }
+
+    PGUARD printf( "uops_per_col: %7ld\n", uops_per_col );
+
+    //
+    // -- Step 2: Compute key flop counts (per thread, per column, etc.) -------
+    //
+
+    // Compute the total cost for the entire block-panel multiply.
+    const dim_t total_uops = uops_per_col * n_iter;
+
+    // Compute the number of microtile ops to allocate per thread as well as the
+    // number of leftover microtile ops.
+    const dim_t n_uops_per_thr = total_uops / nt;
+    const dim_t n_uops_pt_left = total_uops % nt;
+
+    PGUARD printf( "---------------------------\n" );
+    PGUARD printf( "total_uops: %7ld\n", total_uops );
+    PGUARD printf( "n_uops_per_thr: %7ld\n", n_uops_per_thr );
+    PGUARD printf( "n_uops_pt_left: %7ld\n", n_uops_pt_left );
+
+    //
+    // -- Step 3: Compute the starting j/i utile offset for a given tid --------
+    //
+
+    PGUARD printf( "---------------------------\n" );
+    PGUARD printf( "total_utiles: %7ld\n", m_iter * n_iter );
+    PGUARD printf( "---------------------------\n" );
+
+    dim_t j_st_cur = 0; dim_t j_en_cur = 0;
+    dim_t i_st_cur = 0; dim_t i_en_cur = 0;
+
+    PGUARD printf( " tid %ld will start at j,i: %ld %ld\n",
+                   ( dim_t )0, j_st_cur, i_st_cur );
+
+    // Find the utile update that pushes uops_tba to 0 or less.
+#ifdef PRINT_MODE
+    for ( dim_t tid_i = 0; tid_i < nt; ++tid_i )
+#else
+    for ( dim_t tid_i = 0; tid_i < nt - 1; ++tid_i )
+#endif
+    {
+        const dim_t uops_ta = n_uops_per_thr + ( tid_i < n_uops_pt_left ? 1 : 0 );
+        dim_t uops_tba = uops_ta;
+        dim_t j = j_st_cur;
+        dim_t n_ut_for_me = 0;
+        bool done_e = FALSE;
+
+        PGUARD printf( "tid_i: %ld n_uops to alloc: %3ld \n", tid_i, uops_tba );
+
+        // This code begins allocating uops when the starting point is somewhere
+        // after the first microtile. Typically this will not be enough to
+        // allocate all uops, except for small matrices (and/or high numbers of
+        // threads), in which case the code signals an early finish (via done_e).
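+        // (An i_st_cur greater than zero means the previous thread's range
+        // ended partway down utile column j_st_cur, so this thread first
+        // finishes out that column before any whole columns are considered.)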
+ if ( 0 < i_st_cur ) + { + dim_t i; + + //PGUARD printf( "tid_i: %ld uops left to alloc: %2ld \n", tid_i, uops_tba ); + + for ( i = i_st_cur; i < m_iter; ++i ) + { + n_ut_for_me += 1; + + const dim_t uops_tba_new + = uops_tba - + bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i ); + + uops_tba = uops_tba_new; + + PGUARD printf( "tid_i: %ld i: %2ld (1 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba ); + + if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE; + break; } + } + + if ( i == m_iter ) j += 1; + } + + // This code advances over as many columns of utiles as possible and then + // walks down to the correct utile within the subsequent column. However, + // it gets skipped entirely if the previous code block was able to + // allocate all of the current tid's uops. + if ( !done_e ) + { + const dim_t j_inc0 = uops_tba / uops_per_col; + const dim_t j_left0 = uops_tba % uops_per_col; + + // We need to set a hard limit on how much j_inc can be. Namely, + // it should not exceed the number of utile columns that are left + // in the matrix. We also correctly compute j_left when the initial + // computation of j_inc0 above exceeds the revised j_inc, but this + // is mostly only so that in these situations the debug statements + // report the correct numbers. + const dim_t j_inc = bli_min( j_inc0, n_iter - j ); + const dim_t delta = j_inc0 - j_inc; + const dim_t j_left = j_left0 + delta * uops_per_col; + + // Increment j by the number of full utile columns we allocate, and + // set the remaining utile ops to be allocated to the remainder. + j += j_inc; + uops_tba = j_left; + + n_ut_for_me += j_inc * m_iter; + + PGUARD printf( "tid_i: %ld advanced to col: %2ld (uops traversed: %ld)\n", + tid_i, j, uops_per_col * j_inc ); + PGUARD printf( "tid_i: %ld j: %2ld ( n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + PGUARD printf( "tid_i: %ld uops left to alloc: %2ld \n", tid_i, j_left ); + + if ( uops_tba == 0 ) + { + // If advancing j_inc columns allocated all of our uops, then + // designate the last iteration of the previous column as the + // end point. + j_en_cur = j - 1; + i_en_cur = m_iter - 1; + } + else if ( j > n_iter ) bli_abort(); // safety check. + else if ( j == n_iter ) + { + // If we still have at least some uops to allocate, and advancing + // j_inc columns landed us at the beginning of the first non- + // existent column (column n_iter), then we're done. (The fact + // that we didn't get to allocate all of our uops just means that + // the lower tids slightly overshot their allocations, leaving + // fewer uops for the last thread.) + } + else // if ( 0 < uops_tba && j < n_iter ) + { + // If we have at least some uops to allocate, and we still have + // at least some columns to process, then we search for the + // utile that will put us over the top. 
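+                // (Each utile in column j costs bli_tlb_trmm_lx_k_iter(...)
+                // uops, so we walk down the column one utile at a time until
+                // uops_tba reaches zero or goes negative.)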
+ + for ( dim_t i = 0; i < m_iter; ++i ) + { + n_ut_for_me += 1; + + const dim_t uops_tba_new + = uops_tba - + bli_tlb_trmm_lx_k_iter( diagoff_iter, uplo, k_iter, i ); + + uops_tba = uops_tba_new; + + PGUARD printf( "tid_i: %ld i: %2ld (4 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba ); + + if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; + break; } + } + } + } + + + PGUARD printf( "tid_i: %ld (5 n_ut_cur: %ld) (overshoot: %ld out of %ld)\n", + tid_i, n_ut_for_me, -uops_tba, uops_ta ); + + if ( tid_i == tid ) + { + *j_st_p = j_st_cur; + *i_st_p = i_st_cur; + return n_ut_for_me; + } + + // Use the current tid's ending i,j values to determine the starting i,j + // values for the next tid. + j_st_cur = j_en_cur; + i_st_cur = i_en_cur + 1; + if ( i_st_cur == m_iter ) { j_st_cur += 1; i_st_cur = 0; } + + PGUARD printf( "tid_i: %ld (6 n_ut_cur: %ld)\n", + tid_i, n_ut_for_me ); + PGUARD printf( "tid_i: %ld tid %ld will start at j,i: %ld %ld\n", + tid_i, tid_i + 1, j_st_cur, i_st_cur ); + PGUARD printf( "---------------------------\n" ); + } + +#ifndef PRINT_MODE + + // + // -- Step 4: Handle the last thread's allocation -------------------------- + // + + // An optimization: The above loop runs to nt - 1 rather than nt since it's + // easy to count the number of utiles allocated to the last thread. + const dim_t n_ut_for_me = m_iter - i_st_cur + + (n_iter - j_st_cur - 1) * m_iter; + *j_st_p = j_st_cur; + *i_st_p = i_st_cur; + + PGUARD printf( "tid_i: %ld (7 n_ut_for_me: %ld) (j,i_st: %ld %ld)\n", + tid, n_ut_for_me, j_st_cur, i_st_cur ); + + return n_ut_for_me; +#else + // This line should never execute, but we need it to satisfy the compiler. + return -1; +#endif +} + +// ----------------------------------------------------------------------------- + +#if 0 +dim_t bli_thread_range_tlb_trmm_r + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + dim_t n_ut_for_me; + + if ( bli_is_lower( uplo ) ) + { + inc_t j_en_l, i_en_l; + + n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl + ( + nt, tid, diagoff, m_iter, n_iter, k_iter, mr, nr, + j_st_p, i_st_p, &j_en_l, &i_en_l + ); + } + else // if ( bli_is_upper( uplo ) ) + { + inc_t j_st_l, i_st_l; + inc_t j_en_l, i_en_l; + + // Reverse the effective tid and use the diagonal offset as if the m and + // n dimension were reversed (similar to a 180 degree rotation). This + // transforms the problem into one of allocating ranges for a lower- + // triangular matrix, for which we already have a special routine. + const dim_t tid_rev = nt - tid - 1; + const doff_t diagoff_rev = nr*n_iter - ( nr*k_iter + diagoff ); + + n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl + ( + nt, tid_rev, diagoff_rev, m_iter, n_iter, k_iter, mr, nr, + &j_st_l, &i_st_l, &j_en_l, &i_en_l + ); + + // The ending j and i offsets will serve as our starting offsets + // returned to the caller, but first we have to reverse the offsets so + // that their semantics are once again relative to an upper-triangular + // matrix. 
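+        // (That is, the 180 degree rotation maps utile column j to
+        // n_iter - 1 - j and utile row i to m_iter - 1 - i, so the end point
+        // of the reversed problem becomes this thread's starting utile.)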
+ j_en_l = n_iter - j_en_l - 1; + i_en_l = m_iter - i_en_l - 1; + + *j_st_p = j_en_l; + *i_st_p = i_en_l; + } + + return n_ut_for_me; +} +#endif + +dim_t bli_thread_range_tlb_trmm_rl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + inc_t j_en_l, i_en_l; + + return bli_thread_range_tlb_trmm_rl_impl + ( + nt, tid, diagoff, m_iter, n_iter, k_iter, mr, nr, + j_st_p, i_st_p, &j_en_l, &i_en_l + ); +} + +dim_t bli_thread_range_tlb_trmm_ru + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ) +{ + inc_t j_st_l, i_st_l; + inc_t j_en_l, i_en_l; + + // Reverse the effective tid and use the diagonal offset as if the m and + // n dimension were reversed (similar to a 180 degree rotation). This + // transforms the problem into one of allocating ranges for a lower- + // triangular matrix, for which we already have a special routine. + const dim_t tid_rev = nt - tid - 1; + const doff_t diagoff_rev = nr*n_iter - ( nr*k_iter + diagoff ); + + const dim_t n_ut_for_me = bli_thread_range_tlb_trmm_rl_impl + ( + nt, tid_rev, diagoff_rev, m_iter, n_iter, k_iter, mr, nr, + &j_st_l, &i_st_l, &j_en_l, &i_en_l + ); + + // The ending j and i offsets will serve as our starting offsets + // returned to the caller, but first we have to reverse the offsets so + // that their semantics are once again relative to an upper-triangular + // matrix. + j_en_l = n_iter - j_en_l - 1; + i_en_l = m_iter - i_en_l - 1; + + *j_st_p = j_en_l; + *i_st_p = i_en_l; + + return n_ut_for_me; +} + +dim_t bli_thread_range_tlb_trmm_rl_impl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p, + inc_t* j_en_p, + inc_t* i_en_p + ) +{ + // Assumption: 0 <= diagoff. Make sure to prune leading rows beforehand! + if ( diagoff < 0 ) bli_abort(); + + // Single-threaded cases are simple and allow early returns. + if ( nt == 1 ) + { + const dim_t n_ut_for_me = m_iter * n_iter; + + *j_st_p = 0; + *i_st_p = 0; + *j_en_p = n_iter - 1; + *i_en_p = m_iter - 1; + + return n_ut_for_me; + } + + // + // -- Step 1: Compute the computational volume of the region --------------- + // + + // Normalize the diagonal offset by nr so that it represents the offset in + // units of nr x nr chunks. + const doff_t diagoff_iter = diagoff / nr; + + // For the purposes of many computations in this function, we aren't + // interested in the extent to which diagoff exceeds n (if it does) + // So we use a new variable that is guaranteed to be no greater than n. 
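+    // (Here "n" is measured in units of nr, i.e., n_iter. The first
+    // diagoffmin_iter utile columns lie entirely to the left of the diagonal
+    // and therefore use the full k_iter depth; see k_rect/n_rect below.)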
+ const doff_t diagoffmin_iter = bli_min( diagoff_iter, n_iter ); + + const dim_t k_rect = k_iter; + const dim_t n_rect = diagoffmin_iter; + + const dim_t gross_area = k_rect * n_iter; + const dim_t rect_area = k_rect * n_rect; + const dim_t nonrect_area = gross_area - rect_area; + + const dim_t offn_nonrect = n_rect; + const dim_t diagoff_nonrect = 0; + + const dim_t n_nonrect = n_iter - n_rect; + + const dim_t offn_ut_nonrect = diagoffmin_iter; + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "m_iter: %7ld\n", m_iter ); + PGUARD printf( "k_iter: %7ld\n", k_iter ); + PGUARD printf( "n_iter: %7ld\n", n_iter ); + PGUARD printf( "min(diagoff_it,n): %7ld\n", diagoffmin_iter ); + PGUARD printf( "offn_ut_nonrect: %7ld\n", offn_ut_nonrect ); + PGUARD printf( "offn_nonrect: %7ld\n", offn_nonrect ); + PGUARD printf( "diagoff_nonrect: %7ld\n", diagoff_nonrect ); + PGUARD printf( "n_nonrect: %7ld\n", n_nonrect ); + PGUARD printf( "---------------------------\n" ); + + const dim_t num_unref_ut0 = n_nonrect * ( n_nonrect - 1 ) / 2; + const dim_t num_unref_ut = bli_max( 0, num_unref_ut0 ); + + const dim_t tri_unref_area = num_unref_ut; + const dim_t tri_ref_area = nonrect_area - tri_unref_area; + const dim_t total_ref_area = rect_area + tri_ref_area; + const dim_t rect_vol = rect_area * m_iter; + const dim_t tri_ref_vol = tri_ref_area * m_iter; + const dim_t total_vol = total_ref_area * m_iter; + + PGUARD printf( "gross_area: %7ld\n", gross_area ); + PGUARD printf( "nonrect_area: %7ld\n", nonrect_area ); + PGUARD printf( "tri_unref_area: %7ld\n", tri_unref_area ); + PGUARD printf( "rect_area: %7ld\n", rect_area ); + PGUARD printf( "tri_ref_area: %7ld\n", tri_ref_area ); + PGUARD printf( "total_ref_area: %7ld\n", total_ref_area ); + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "rect_vol (uops): %7ld\n", rect_vol ); + PGUARD printf( "tri_ref_vol (uops): %7ld\n", tri_ref_vol ); + PGUARD printf( "total_vol (uops): %7ld\n", total_vol ); + PGUARD printf( "---------------------------\n" ); + + // + // -- Step 2: Compute key flop counts (per thread, per column, etc.) ------- + // + + //const dim_t rect_uops = rect_vol; + //const dim_t tri_ref_uops = tri_ref_vol; + const dim_t total_uops = total_vol; + + // Compute the number of microtile ops to allocate per thread as well as the + // number of leftover microtile ops. + const dim_t n_uops_per_thr = total_uops / nt; + const dim_t n_uops_pt_left = total_uops % nt; + + PGUARD printf( "n_threads: %7ld\n", nt ); + PGUARD printf( "n_uops_per_thr: %7ld\n", n_uops_per_thr ); + PGUARD printf( "n_uops_pt_left: %7ld\n", n_uops_pt_left ); + PGUARD printf( "---------------------------\n" ); + + const dim_t uops_per_col_rect = m_iter * k_iter; + + PGUARD printf( "uops_per_col_rect: %7ld\n", uops_per_col_rect ); + + // Allocate one of the leftover uops to the current thread if its tid is + // one of the lower thread ids. + //const dim_t n_uops_for_me = n_uops_per_thr + ( tid < n_uops_pt_left ? 1 : 0 ); + + //PGUARD printf( "n_uops_for_me: %7ld (%ld+%ld)\n", + // n_uops_for_me, n_uops_per_thr, n_uops_for_me - n_uops_per_thr ); + + // + // -- Step 3: Compute the starting j/i utile offset for a given tid) ------- + // + + PGUARD printf( "---------------------------\n" ); + PGUARD printf( "total_utiles: %7ld\n", m_iter * n_iter ); + PGUARD printf( "---------------------------\n" ); + + dim_t j_st_cur = 0; dim_t j_en_cur = 0; + dim_t i_st_cur = 0; dim_t i_en_cur = 0; + + // Find the utile update that pushes uops_tba to 0 or less. 
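+    // Each pass through this loop simulates the allocation for one thread id,
+    // starting where the previous thread's range ended; the loop returns as
+    // soon as the requested tid's starting offsets have been determined.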
+#ifdef PRINT_MODE
+    for ( dim_t tid_i = 0; tid_i < nt; ++tid_i )
+#else
+    for ( dim_t tid_i = 0; tid_i < nt - 1; ++tid_i )
+#endif
+    {
+        const dim_t uops_ta = n_uops_per_thr + ( tid_i < n_uops_pt_left ? 1 : 0 );
+        dim_t uops_tba = uops_ta;
+        dim_t j = j_st_cur;
+        dim_t n_ut_for_me = 0;
+        bool done_e = FALSE;
+        bool search_tri = FALSE;
+
+        PGUARD printf( "tid_i: %ld n_uops_ta: %3ld \n", tid_i, uops_tba );
+        PGUARD printf( "tid_i: %ld j: %2ld ( n_ut_cur: %ld) (uops_alloc: %ld)\n",
+                       tid_i, j, n_ut_for_me, uops_ta - uops_tba );
+
+        // This code begins allocating uops when the starting point is somewhere
+        // after the first microtile. Typically this will not be enough to
+        // allocate all uops, except for situations where the number of threads
+        // is high relative to the number of utile columns, in which case the
+        // code signals an early finish (via done_e).
+        if ( 0 < i_st_cur )
+        {
+            // Compute the number of uops needed to update each utile in the
+            // current column.
+            const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j );
+
+            dim_t i;
+
+            #if 0
+
+            // Starting from i_st_cur within the current utile column, allocate
+            // utiles until (a) we run out of utiles in the column (which is typically
+            // what happens), or (b) we finish allocating all uops for the current
+            // thread (uops_tba drops to zero or less).
+            for ( i = i_st_cur; i < m_iter; ++i )
+            {
+                n_ut_for_me += 1;
+
+                const dim_t uops_tba_new = uops_tba - k_iter_j;
+
+                uops_tba = uops_tba_new;
+
+                PGUARD printf( "tid_i: %ld i: %2ld (0 n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n",
+                               tid_i, i, n_ut_for_me, uops_ta - uops_tba, k_iter_j );
+
+                if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; done_e = TRUE;
+                                           break; }
+            }
+
+            // If we traversed the entire column (regardless of whether we finished
+            // allocating utiles for the current thread), increment j to the next
+            // column, which is where we'll continue our search for the current tid
+            // (or start our search for the next tid if we finished allocating utiles).
+            // Additionally, if we finished traversing all utile columns, mark the
+            // last utile of the last column as the end point, and set the "done early"
+            // flag.
+            if ( i == m_iter )
+            {
+                j += 1;
+                if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; done_e = TRUE; }
+            }
+
+            #else
+
+            // Compute the number of utiles left to allocate under the (probably false)
+            // assumption that all utiles incur the same uop cost (k_iter_j) to update.
+            // Also compute the number of utiles that remain in the current column.
+            const dim_t n_ut_tba_j = uops_tba / k_iter_j + ( uops_tba % k_iter_j ? 1 : 0 );
+            const dim_t n_ut_rem_j = m_iter - i_st_cur;
+
+            // Compare the aforementioned values. If n_ut_tba_j is less than or equal to
+            // the number of remaining utiles in the column, we can finish allocating
+            // without moving to the next column. But if n_ut_tba_j exceeds n_ut_rem_j,
+            // then we aren't done yet, so allocate what we can and move on.
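+            // (For example, uops_tba = 7 with k_iter_j = 3 gives n_ut_tba_j = 3,
+            // since two utiles cover only six of the seven remaining uops.)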
+ if ( n_ut_tba_j <= n_ut_rem_j ) + { + n_ut_for_me += n_ut_tba_j; + uops_tba -= n_ut_tba_j * k_iter_j; + i = i_st_cur + n_ut_tba_j; + + j_en_cur = j; i_en_cur = i - 1; done_e = TRUE; + } + else // if ( n_ut_rem_j < n_ut_tba_j ) + { + n_ut_for_me += n_ut_rem_j; + uops_tba -= n_ut_rem_j * k_iter_j; + i = i_st_cur + n_ut_rem_j; + } + + PGUARD printf( "tid_i: %ld i: %2ld (* n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i-1, n_ut_for_me, uops_ta - uops_tba ); + + // If we allocated all utiles in the column (regardless of whether we finished + // allocating utiles for the current thread), increment j to the next column, + // which is where we'll continue our search for the current tid's end point + // (or start our search through the next tid's range if we finished allocating + // the current tid's utiles). Additionally, if we allocated utiles from the + // last column, mark the tid's end point and set the "done early" flag. + if ( i == m_iter ) + { + j += 1; i = 0; + if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; done_e = TRUE; } + + PGUARD printf( "tid_i: %ld j: %2ld (! n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + + #endif + } + + // This code advances over as many columns of utiles as possible, within + // the rectangular region (i.e., pre-diagonal), and then walks down to + // the correct utile within the subsequent column. However, note that + // this code gets skipped entirely if the previous code block was able + // to allocate all of the current tid's uops. + if ( !done_e ) + { + // If j is positioned somewhere within the rectangular region, we can + // skip over as many utile columns as possible with some integer math. + // And depending on how many uops we were able to allocate relative to + // the number of columns that exist, we may need to walk through the + // triangular region as well. But if j is already in the triangular + // region, we set a flag so that we execute the code that will walk + // through those columns. + if ( j < diagoff_iter ) + { + const dim_t j_inc0 = uops_tba / uops_per_col_rect; + const dim_t j_left0 = uops_tba % uops_per_col_rect; + + // We need to set a hard limit on how much j_inc can be. Namely, + // it should not exceed the number of utile columns that are left + // in the rectangular region of the matrix, nor should it exceed + // the total number of utile columns that are left. + const dim_t j_inc1 = bli_min( j_inc0, diagoff_iter - j ); + const dim_t j_inc = bli_min( j_inc1, n_iter - j ); + const dim_t delta = j_inc0 - j_inc; + const dim_t j_left = j_left0 + delta * uops_per_col_rect; + + // Increment j by the number of full utile columns we allocate, and + // set the remaining utile ops to be allocated to the remainder. + j += j_inc; + uops_tba = j_left; + + n_ut_for_me += j_inc * m_iter; + + PGUARD printf( "tid_i: %ld advanced to col: %2ld (uops traversed: %ld)\n", + tid_i, j, uops_per_col_rect * j_inc ); + PGUARD printf( "tid_i: %ld j: %2ld (1 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + PGUARD printf( "tid_i: %ld uops left to alloc: %2ld \n", tid_i, j_left ); + + if ( uops_tba == 0 ) + { + // If advancing j_inc columns allocated all of our uops, then + // designate the last iteration of the previous column as the + // end point. 
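+                    // (j has already been incremented past the fully allocated
+                    // columns, hence the j - 1 below.)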
+ j_en_cur = j - 1; + i_en_cur = m_iter - 1; + search_tri = FALSE; + + PGUARD printf( "tid_i: %ld j: %2ld (2 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + else if ( j > n_iter ) bli_abort(); // Safety check; should never execute. + else if ( j == n_iter ) + { + // If we still have at least some uops to allocate, and advancing + // j_inc columns landed us at the beginning of the first non- + // existent column (column n_iter), then we're done. (The fact + // that we didn't get to allocate all of our uops just means that + // the lower tids slightly overshot their allocations, leaving + // fewer uops for the last thread.) + search_tri = FALSE; + PGUARD printf( "tid_i: %ld j: %2ld (3 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + else if ( j < diagoff_iter ) + { + // If we still have at least some uops to allocate, and advancing + // j_inc columns landed us at the beginning of a column that is + // still in the rectangular region, then we don't need to enter + // the triangular region (if it even exists). The code below will + // walk down the current column and find the utile that puts us + // over the top. + search_tri = FALSE; + PGUARD printf( "tid_i: %ld j: %2ld (4 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + else // if ( 0 < uops_tba && j == diagoff_iter && j < n_iter ) + { + // If we have at least some uops to allocate, and we still have + // at least some columns to process, then we set a flag to + // indicate that we still need to step through the triangular + // region. + search_tri = TRUE; + PGUARD printf( "tid_i: %ld j: %2ld (5 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + } + else /* if ( diagoff_iter <= j ) */ + { + PGUARD printf( "tid_i: %ld j: %2ld >= diagoff_iter: %ld\n", + tid_i, j, diagoff_iter ); + search_tri = TRUE; + } + + PGUARD printf( "tid_i: %ld j: %2ld search_tri: %d\n", tid_i, j, search_tri ); + + if ( search_tri ) + { + // If we still have some uops to allocate in the triangular region, + // we first allocate as many full utile columns as possible without + // exceeding the number of uops left to be allocated. + for ( ; j < n_iter; ++j ) + { + const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j ); + const dim_t n_uops_j = k_iter_j * m_iter; + + PGUARD printf( "tid_i: %ld j: %2ld (6 n_ut_cur: %ld) (uops_alloc: %ld) (n_uops_j: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba, n_uops_j ); + + if ( uops_tba == 0 ) + { + PGUARD printf( "tid_i: %ld j: %2ld (7 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + // If advancing over the previous column allocated all of + // our uops, then designate the last iteration of the + // previous column as the end point. + j_en_cur = j - 1; + i_en_cur = m_iter - 1; + break; + } + if ( n_uops_j <= uops_tba ) + { + // If advancing over the current column doesn't exceed the + // number of uops left to allocate, then allocate them. (If + // n_uops_j == uops_tba, then we'll be done shortly after + // incrementing j.) 
+ n_ut_for_me += m_iter; + uops_tba -= n_uops_j; + + PGUARD printf( "tid_i: %ld j: %2ld (8 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + } + else // if ( uops_tba < n_uops_j ) + { + PGUARD printf( "tid_i: %ld j: %2ld (9 n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba ); + // If we can finish allocating all the remaining uops + // with the utiles in the current column, then we break + // out of the loop without updating j, n_ut_for_me, or + // uops_tba. The remaining uops will be allocated in + // the loop over m_iter below. + break; + } + } + } + + // If there are any uops left to allocate, and we haven't already + // exhausted all allocatable utiles, it means that we have to walk down + // the current column and find the utile that puts us over the top. + if ( 0 < uops_tba && j < n_iter ) + { + const dim_t k_iter_j = bli_tlb_trmm_rl_k_iter( diagoff_iter, k_iter, j ); + + PGUARD printf( "tid_i: %ld j: %2ld (A n_ut_cur: %ld) (uops_alloc: %ld) (k_iter_j: %ld)\n", + tid_i, j, n_ut_for_me, uops_ta - uops_tba, k_iter_j ); + + #if 0 + + dim_t i; + for ( i = 0; i < m_iter; ++i ) + { + n_ut_for_me += 1; + const dim_t uops_tba_new = uops_tba - k_iter_j; + uops_tba = uops_tba_new; + PGUARD printf( "tid_i: %ld i: %2ld (B n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba ); + if ( uops_tba_new <= 0 ) { j_en_cur = j; i_en_cur = i; break; } + } + + if ( i == m_iter ) + { + j += 1; + if ( j == n_iter ) { j_en_cur = j - 1; i_en_cur = m_iter - 1; } + } + + #else + + const dim_t n_ut_j = uops_tba / k_iter_j + ( uops_tba % k_iter_j ? 1 : 0 ); + const dim_t i = n_ut_j - 1; + + uops_tba -= n_ut_j * k_iter_j; + n_ut_for_me += n_ut_j; + + j_en_cur = j; i_en_cur = i; + + PGUARD printf( "tid_i: %ld i: %2ld (b n_ut_cur: %ld) (uops_alloc: %ld)\n", + tid_i, i, n_ut_for_me, uops_ta - uops_tba ); + + #endif + } + else // if ( uops_tba <= 0 || j == n_iter ) + { + j_en_cur = j - 1; + i_en_cur = m_iter - 1; + } + } + + PGUARD printf( "tid_i: %ld done! (C n_ut_cur: %ld) (overshoot: %ld out of %ld)\n", + tid_i, n_ut_for_me, -uops_tba, uops_ta ); + + if ( tid_i == tid ) + { + *j_st_p = j_st_cur; + *i_st_p = i_st_cur; + *j_en_p = j_en_cur; + *i_en_p = i_en_cur; + return n_ut_for_me; + } + + // Use the current tid's ending i,j values to determine the starting i,j + // values for the next tid. + j_st_cur = j_en_cur; + i_st_cur = i_en_cur + 1; + if ( i_st_cur == m_iter ) { j_st_cur += 1; i_st_cur = 0; } + + PGUARD printf( "tid_i: %ld (D n_ut_cur: %ld)\n", + tid_i, n_ut_for_me ); + PGUARD printf( "tid_i: %ld tid %ld will start at j,i: %ld %ld\n", + tid_i, tid_i + 1, j_st_cur, i_st_cur ); + PGUARD printf( "---------------------------\n" ); + } + +#ifndef PRINT_MODE + + // + // -- Step 4: Handle the last thread's allocation -------------------------- + // + + // An optimization: The above loop runs to nt - 1 rather than nt since it's + // easy to count the number of utiles allocated to the last thread. + const dim_t n_ut_for_me = m_iter - i_st_cur + + (n_iter - j_st_cur - 1) * m_iter; + *j_st_p = j_st_cur; + *i_st_p = i_st_cur; + *j_en_p = n_iter - 1; + *i_en_p = m_iter - 1; + + PGUARD printf( "tid_i: %ld (E n_ut_for_me: %ld) (j,i_st: %ld %ld)\n", + tid, n_ut_for_me, j_st_cur, i_st_cur ); + + return n_ut_for_me; +#else + // This line should never execute, but we need it to satisfy the compiler. 
+ return -1; +#endif +} + diff --git a/frame/thread/bli_thread_range_tlb.h b/frame/thread/bli_thread_range_tlb.h new file mode 100644 index 0000000000..b344f09ef8 --- /dev/null +++ b/frame/thread/bli_thread_range_tlb.h @@ -0,0 +1,192 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#ifndef BLIS_THREAD_RANGE_TLB_H +#define BLIS_THREAD_RANGE_TLB_H + +#if 0 +dim_t bli_thread_range_tlb + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +#endif +dim_t bli_thread_range_tlb_l + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_u + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_d + ( + const dim_t nt, + const dim_t tid, + const dim_t m_iter, + const dim_t n_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); + +// --- + +dim_t bli_thread_range_tlb_trmm_ll + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_trmm_lu + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_trmm_lx_impl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +#if 0 +dim_t bli_thread_range_tlb_trmm_r + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const uplo_t uplo, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +#endif + +// --- + +dim_t bli_thread_range_tlb_trmm_rl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_trmm_ru + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p + ); +dim_t bli_thread_range_tlb_trmm_rl_impl + ( + const dim_t nt, + const dim_t tid, + const doff_t diagoff, + const dim_t m_iter, + const dim_t n_iter, + const dim_t k_iter, + const dim_t mr, + const dim_t nr, + inc_t* j_st_p, + inc_t* i_st_p, + inc_t* j_en_p, + inc_t* i_en_p + ); + +#endif diff --git a/frame/thread/old/bli_thread_range_snake.c b/frame/thread/old/bli_thread_range_snake.c new file mode 100644 index 0000000000..11a287659c --- /dev/null +++ b/frame/thread/old/bli_thread_range_snake.c @@ -0,0 +1,120 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#if 0 +void bli_thread_range_snake_jr + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ) +{ + // Use snake partitioning of jr loop. + + // NOTE: This function currently assumes that edge cases are handled + // "high" and therefore ignores handle_edge_low. This is because the + // function is only used by gemmt and friends (herk/her2k/syrk/syr2k). + // These operations, unlike trmm/trmm3 and trsm, never require + // low-range edge cases. + + const dim_t tid = bli_thrinfo_work_id( thread ); + const dim_t nt = bli_thrinfo_n_way( thread ); + + const dim_t n_left = n % bf; + const dim_t n_iter = n / bf + ( n_left ? 1 : 0 ); + + if ( bli_is_lower( uplo ) ) + { + // Use the thrinfo_t work id as the thread's starting index. + const dim_t st = tid; + + // This increment will be too big for some threads with only one unit + // (NR columns, or an edge case) of work, but that's okay since all that + // matters is that st + in >= en, which will cause that thread's jr loop + // to not execute beyond the first iteration. + const dim_t in = 2 * ( nt - tid ) - 1; + + dim_t en = st + in + 1; + + // Don't let the thread's end index exceed n_iter. + if ( n_iter < en ) en = n_iter; + + *start = st * bf; + *end = en * bf; // - ( bf - n_left ); + *inc = in * bf; + } + else // if ( bli_is_upper( uplo ) ) + { + dim_t st = n_iter - 2 * nt + tid; + + const dim_t in = 2 * ( nt - tid ) - 1; + + dim_t en = st + in + 1; + + #if 1 + // When nt exceeds half n_iter, some threads will only get one unit + // (NR columns, or an edge case) of work. This manifests as st being + // negative, and thus we need to move their start index to their other + // assigned unit in the positive index range. + if ( st < 0 ) st += in; + + // If the start index is *still* negative, which happens for some + // threads when nt exceeds n_iter, then manually assign this thread + // an empty index range. 
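+        // (For example, with nt = 4 and n_iter = 3, thread 3 starts at
+        // st = -2, and even after adding in = 1 it remains negative, so it
+        // receives an empty range.)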
+ if ( st < 0 ) { st = 0; en = 0; } + #else + if ( 0 <= st + in ) { st += in; } + else { st = 0; en = 0; } + #endif + + #if 0 + printf( "thread_range_snake_jr(): tid %d: sta end = %3d %3d %3d\n", + (int)tid, (int)(st), (int)(en), (int)(in) ); + #endif + + *start = st * bf; + *end = en * bf; + *inc = in * bf; + } +} +#endif diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/thread/old/bli_thread_range_snake.h similarity index 70% rename from frame/1m/packm/bli_packm_thrinfo.h rename to frame/thread/old/bli_thread_range_snake.h index 1ac7f88dfb..73fd4ae733 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/thread/old/bli_thread_range_snake.h @@ -32,34 +32,22 @@ */ -// -// thrinfo_t macros specific to packm. -// - -/* -#define bli_packm_thread_my_iter( index, thread ) \ -\ - ( index % thread->n_way == thread->work_id % thread->n_way ) -*/ - -#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ -\ - ( i % n_way == work_id % n_way ) - -#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ -\ - ( start <= i && i < end ) - -// Define a general-purpose version of bli_packm_my_iter() whose definition -// depends on whether slab or round-robin partitioning was requested at -// configure-time. -#ifdef BLIS_ENABLE_JRIR_SLAB - - #define bli_packm_my_iter bli_packm_my_iter_sl - -#else // BLIS_ENABLE_JRIR_RR - - #define bli_packm_my_iter bli_packm_my_iter_rr - +#ifndef BLIS_THREAD_RANGE_SNAKE_H +#define BLIS_THREAD_RANGE_SNAKE_H + +#if 0 +void bli_thread_range_snake_jr + ( + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc + ); #endif +#endif diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 02f7458adf..b61140743f 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -344,11 +344,11 @@ void PASTECH2(bls_,ch,varname) \ \ /* Compute the addresses of the next micropanels of A and B. */ \ a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ - if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) \ + if ( bli_is_last_iter_slrr( i, ir_end, ir_tid, ir_nt ) ) \ { \ a2 = a_ic_use; \ b2 = bli_gemm_get_next_b_upanel( b_jr, ps_b_use, 1 ); \ - if ( bli_is_last_iter( j, jr_end, jr_tid, jr_nt ) ) \ + if ( bli_is_last_iter_slrr( j, jr_end, jr_tid, jr_nt ) ) \ b2 = b_pc_use; \ } \ \ diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c index 7c2c4e9a90..b37d34cce3 100644 --- a/sandbox/gemmlike/bls_l3_packm_var1.c +++ b/sandbox/gemmlike/bls_l3_packm_var1.c @@ -131,10 +131,10 @@ void PASTECH2(bls_,ch,varname) \ dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. 
*/ \ for ( ic = ic0, it = 0; it < n_iter; \ @@ -147,10 +147,10 @@ void PASTECH2(bls_,ch,varname) \ ctype* restrict c_use = c_begin; \ ctype* restrict p_use = p_begin; \ \ - /* The definition of bli_packm_my_iter() will depend on whether slab + /* The definition of bli_is_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. (The default is slab.) */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ PASTECH2(bls_,ch,packm_cxk) \ ( \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c index 94ee0efcd8..b3efbbc28f 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -131,10 +131,10 @@ void PASTECH2(bls_,ch,varname) \ dim_t it_start, it_end, it_inc; \ \ /* Determine the thread range and increment using the current thread's - packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() + packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr() will depend on whether slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ + bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc ); \ \ /* Iterate over every logical micropanel in the source matrix. */ \ for ( ic = ic0, it = 0; it < n_iter; \ @@ -147,10 +147,10 @@ void PASTECH2(bls_,ch,varname) \ ctype* restrict c_use = c_begin; \ ctype* restrict p_use = p_begin; \ \ - /* The definition of bli_packm_my_iter() will depend on whether slab + /* The definition of bli_is_my_iter() will depend on whether slab or round-robin partitioning was requested at configure-time. (The default is slab.) */ \ - if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ + if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ /* NOTE: We assume here that kappa = 1 and therefore ignore it. If we're wrong, this will get someone's attention. */ \ diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 851102a2ff..8656652b33 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -786,7 +786,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) char impl_str[32]; char def_impl_set_str[32]; char def_impl_unset_str[32]; - char jrir_str[16]; + char jrir_str[32]; const bool has_openmp = bli_info_get_enable_openmp(); const bool has_pthreads = bli_info_get_enable_pthreads(); @@ -821,8 +821,9 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) else sprintf( def_impl_set_str, "single" ); // Describe the status of jrir thread partitioning. 
- if ( bli_info_get_thread_part_jrir_slab() ) sprintf( jrir_str, "slab" ); - else /*bli_info_get_thread_part_jrir_rr()*/ sprintf( jrir_str, "round-robin" ); + if ( bli_info_get_thread_jrir_slab() ) sprintf( jrir_str, "slab" ); + else if ( bli_info_get_thread_jrir_rr() ) sprintf( jrir_str, "round-robin" ); + else /*bli_info_get_thread_jrir_tlb()*/ sprintf( jrir_str, "tile-level (slab)" ); char nt_str[16]; char jc_nt_str[16]; diff --git a/testsuite/src/test_trmm.c b/testsuite/src/test_trmm.c index 0504b33158..497ecf97ea 100644 --- a/testsuite/src/test_trmm.c +++ b/testsuite/src/test_trmm.c @@ -271,7 +271,10 @@ void libblis_test_trmm_impl switch ( iface ) { case BLIS_TEST_SEQ_FRONT_END: +//bli_printm( "a", a, "%5.2f", "" ); +//bli_printm( "b", b, "%5.2f", "" ); bli_trmm( side, alpha, a, b ); +//bli_printm( "b after", b, "%5.2f", "" ); break; default: