diff --git a/include/Sieve.hpp b/include/Sieve.hpp index 58b1830e..51b4685f 100644 --- a/include/Sieve.hpp +++ b/include/Sieve.hpp @@ -255,32 +255,33 @@ class Sieve uint64_t m2 = unset_larger[stop % 240]; const uint64_t* sieve64 = (const uint64_t*) sieve_.data(); - if (start_idx == stop_idx) - return popcnt64(sieve64[start_idx] & m1 & m2); - else - { - uint64_t i = start_idx + 1; - uint64_t start_bits = sieve64[start_idx] & m1; - uint64_t stop_bits = sieve64[stop_idx] & m2; - uint64_t cnt = popcnt64(start_bits); - cnt += popcnt64(stop_bits); - svuint64_t vcnt = svdup_u64(0); - - // Compute this for loop using ARM SVE. - // for (i = start_idx + 1; i < stop_idx; i++) - // cnt += popcnt64(sieve64[i]); - do { - svbool_t pg = svwhilelt_b64(i, stop_idx); - svuint64_t vec = svld1_u64(pg, &sieve64[i]); - vec = svcnt_u64_z(pg, vec); - vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); - i += svcntd(); - } - while (i < stop_idx); - cnt += svaddv_u64(svptrue_b64(), vcnt); + // Branchfree bitmask calculation: + // m1 = (start_idx != stop_idx) ? m1 : m1 & m2; + m1 = (m1 * (start_idx != stop_idx)) | ((m1 & m2) * (start_idx == stop_idx)); + // m2 = (start_idx != stop_idx) ? m2 : 0; + m2 *= (start_idx != stop_idx); - return cnt; + uint64_t i = start_idx + 1; + uint64_t start_bits = sieve64[start_idx] & m1; + uint64_t stop_bits = sieve64[stop_idx] & m2; + uint64_t cnt = popcnt64(start_bits); + cnt += popcnt64(stop_bits); + svuint64_t vcnt = svdup_u64(0); + + // Compute this for loop using ARM SVE. + // for (i = start_idx + 1; i < stop_idx; i++) + // cnt += popcnt64(sieve64[i]); + do { + svbool_t pg = svwhilelt_b64(i, stop_idx); + svuint64_t vec = svld1_u64(pg, &sieve64[i]); + vec = svcnt_u64_z(pg, vec); + vcnt = svadd_u64_x(svptrue_b64(), vcnt, vec); + i += svcntd(); } + while (i < stop_idx); + cnt += svaddv_u64(svptrue_b64(), vcnt); + + return cnt; } #endif