Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consistent trie sort order #5615

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions utils/zerotrie/benches/overview.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ use zerotrie::ZeroTrieSimpleAscii;
use zerovec::ZeroHashMap;
#[cfg(feature = "bench")]
use zerovec::ZeroMap;
use zerotrie::ByteStr;

mod testdata {
use zerotrie::ByteStr;
include!("../tests/data/data.rs");
}

Expand Down Expand Up @@ -137,7 +139,7 @@ fn get_subtags_bench_large(c: &mut Criterion) {
fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(
mut g: criterion::BenchmarkGroup<M>,
strings: &[&str],
litemap: LiteMap<&[u8], usize>,
litemap: LiteMap<&ByteStr, usize>,
) {
g.bench_function("SimpleAscii", |b| {
let trie = ZeroTrieSimpleAscii::try_from(&litemap).unwrap();
Expand Down Expand Up @@ -171,7 +173,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(

#[cfg(feature = "bench")]
g.bench_function("ZeroMap/usize", |b| {
let zm: ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (*a, b)).collect();
let zm: ZeroMap<[u8], usize> = litemap.iter().map(|(a, b)| (a.as_bytes(), b)).collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
let actual = black_box(&zm).get_copied(key.as_bytes());
Expand All @@ -182,7 +184,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(

#[cfg(feature = "bench")]
g.bench_function("ZeroMap/u8", |b| {
let zm: ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
let zm: ZeroMap<[u8], u8> = litemap.iter().map(|(k, v)| (k.as_bytes(), *v as u8)).collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
let actual = black_box(&zm).get_copied(key.as_bytes());
Expand All @@ -193,7 +195,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(

#[cfg(feature = "bench")]
g.bench_function("HashMap", |b| {
let hm: HashMap<&[u8], usize> = litemap.iter().map(|(a, b)| (*a, *b)).collect();
let hm: HashMap<&[u8], usize> = litemap.iter().map(|(a, b)| (a.as_bytes(), *b)).collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
let actual = black_box(&hm).get(key.as_bytes());
Expand All @@ -206,7 +208,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(
g.bench_function("ZeroHashMap/usize", |b| {
let zhm: ZeroHashMap<[u8], usize> = litemap
.iter()
.map(|(a, b)| (*a, b))
.map(|(a, b)| (a.as_bytes(), b))
.collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
Expand All @@ -220,7 +222,7 @@ fn get_subtags_bench_helper<M: criterion::measurement::Measurement>(

#[cfg(feature = "bench")]
g.bench_function("ZeroHashMap/u8", |b| {
let zhm: ZeroHashMap<[u8], u8> = litemap.iter().map(|(k, v)| (*k, *v as u8)).collect();
let zhm: ZeroHashMap<[u8], u8> = litemap.iter().map(|(k, v)| (k.as_bytes(), *v as u8)).collect();
b.iter(|| {
for (i, key) in black_box(strings).iter().enumerate() {
let actual = black_box(&zhm).get(key.as_bytes()).copied();
Expand Down
2 changes: 1 addition & 1 deletion utils/zerotrie/examples/first_weekday_for_region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ mod weekday {

// This data originated from CLDR 41.
static DATA: &[(&str, usize)] = &[
("001", weekday::MON),
("AD", weekday::MON),
("AE", weekday::SAT),
("AF", weekday::SAT),
Expand Down Expand Up @@ -129,6 +128,7 @@ static DATA: &[(&str, usize)] = &[
("NP", weekday::SUN),
("NZ", weekday::MON),
("OM", weekday::SAT),
("001", weekday::MON),
("PA", weekday::SUN),
("PE", weekday::SUN),
("PH", weekday::SUN),
Expand Down
98 changes: 74 additions & 24 deletions utils/zerotrie/src/builder/bytestr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,78 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::borrow::Borrow;
use crate::comparison;
use core::cmp::Ordering;
use core::fmt;

#[cfg(feature = "serde")]
use alloc::boxed::Box;

/// A struct transparent over `[u8]` with convenient helper functions.
/// A string key in a ZeroTrie.
///
/// This type has a custom Ord impl, making it suitable for use in a sorted
/// map for ZeroTrie construction.
#[repr(transparent)]
#[derive(PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct ByteStr([u8]);
#[derive(PartialEq, Eq)]
pub struct ByteStr([u8]);

impl ByteStr {
pub const fn from_byte_slice_with_value<'a, 'l>(
#[inline]
pub(crate) const fn from_byte_slice_with_value<'a, 'l>(
input: &'l [(&'a [u8], usize)],
) -> &'l [(&'a ByteStr, usize)] {
// Safety: [u8] and ByteStr have the same layout and invariants
unsafe { core::mem::transmute(input) }
}

pub const fn from_str_slice_with_value<'a, 'l>(
#[inline]
pub(crate) const fn from_str_slice_with_value<'a, 'l>(
input: &'l [(&'a str, usize)],
) -> &'l [(&'a ByteStr, usize)] {
// Safety: str and ByteStr have the same layout, and ByteStr is less restrictive
unsafe { core::mem::transmute(input) }
}

pub fn from_bytes(input: &[u8]) -> &Self {
/// Casts a `&[u8]` to a `&ByteStr`
#[inline]
pub const fn from_bytes(input: &[u8]) -> &Self {
// Safety: [u8] and ByteStr have the same layout and invariants
unsafe { core::mem::transmute(input) }
}

#[cfg(feature = "serde")]
pub fn from_boxed_bytes(input: Box<[u8]>) -> Box<Self> {
/// Casts a `Box<[u8]>` to a `Box<ByteStr>`
#[cfg(feature = "alloc")]
pub const fn from_boxed_bytes(input: Box<[u8]>) -> Box<Self> {
// Safety: [u8] and ByteStr have the same layout and invariants
unsafe { core::mem::transmute(input) }
}

#[allow(dead_code)] // may want this in the future
pub fn from_str(input: &str) -> &Self {
/// Casts a `&str` to a `&ByteStr`
pub const fn from_str(input: &str) -> &Self {
Self::from_bytes(input.as_bytes())
}

#[allow(dead_code)] // may want this in the future
pub fn empty() -> &'static Self {
/// Creates an empty ByteStr
pub const fn empty() -> &'static Self {
Self::from_bytes(&[])
}

#[allow(dead_code)] // not used in all features
/// Returns this ByteStr as a byte slice
pub const fn as_bytes(&self) -> &[u8] {
&self.0
}

/// Whether the ByteStr is an empty slice
pub const fn is_empty(&self) -> bool {
self.len() == 0
}

/// How many bytes are in the ByteStr
pub const fn len(&self) -> usize {
self.0.len()
}

#[allow(dead_code)] // not used in all features
/// Whether the ByteStr is all ASCII-range
pub fn is_all_ascii(&self) -> bool {
for byte in self.0.iter() {
if !byte.is_ascii() {
Expand All @@ -78,13 +94,15 @@ impl ByteStr {
}

/// Const function to evaluate `self < other`.
pub(crate) const fn is_less_then(&self, other: &Self) -> bool {
pub(crate) const fn is_less_than(&self, other: &Self) -> bool {
let mut i = 0;
while i < self.len() && i < other.len() {
if self.0[i] < other.0[i] {
let a = comparison::shift(self.0[i]);
let b = comparison::shift(other.0[i]);
if a < b {
return true;
}
if self.0[i] > other.0[i] {
if a > b {
return false;
}
i += 1;
Expand All @@ -107,15 +125,47 @@ impl ByteStr {
}
}

impl Borrow<[u8]> for ByteStr {
fn borrow(&self) -> &[u8] {
// Note: Does NOT impl Borrow<[u8]> because the Ord impls differ.
// AsRef is okay to implement.

impl AsRef<[u8]> for ByteStr {
fn as_ref(&self) -> &[u8] {
self.as_bytes()
}
}

#[cfg(feature = "alloc")]
impl Borrow<[u8]> for alloc::boxed::Box<ByteStr> {
fn borrow(&self) -> &[u8] {
self.as_bytes()
impl AsRef<ByteStr> for ByteStr {
fn as_ref(&self) -> &ByteStr {
self
}
}

impl<'a> From<&'a str> for &'a ByteStr {
fn from(other: &'a str) -> Self {
ByteStr::from_str(other)
}
}

impl fmt::Debug for ByteStr {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
if let Ok(s) = core::str::from_utf8(self.as_bytes()) {
write!(f, "{s}")
} else {
write!(f, "{:?}", self.as_bytes())
}
}
}

impl Ord for ByteStr {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
crate::comparison::cmp_slices(&self.0, &other.0)
}
}

impl PartialOrd for ByteStr {
#[inline]
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
2 changes: 1 addition & 1 deletion utils/zerotrie/src/builder/konst/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ impl<const N: usize> ZeroTrieBuilderConst<N> {
match prev {
None => (),
Some(prev) => {
if !prev.is_less_then(ascii_str) {
if !prev.is_less_than(ascii_str) {
panic!("Strings in ByteStr constructor are not sorted");
}
}
Expand Down
18 changes: 8 additions & 10 deletions utils/zerotrie/src/builder/litemap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,33 +15,31 @@ use litemap::LiteMap;

impl ZeroTrieSimpleAscii<Vec<u8>> {
#[doc(hidden)]
pub fn try_from_litemap_with_const_builder<'a, S>(
items: &LiteMap<&'a [u8], usize, S>,
pub fn try_from_litemap_with_const_builder<'a, 'b, S>(
items: &'a LiteMap<&'b ByteStr, usize, S>,
) -> Result<Self, ZeroTrieBuildError>
where
S: litemap::store::StoreSlice<&'a [u8], usize, Slice = [(&'a [u8], usize)]>,
S: litemap::store::StoreSlice<&'b ByteStr, usize, Slice = [(&'b ByteStr, usize)]>,
{
let tuples = items.as_slice();
let byte_str_slice = ByteStr::from_byte_slice_with_value(tuples);
let byte_str_slice = items.as_slice();
ZeroTrieBuilderConst::<10000>::from_sorted_const_tuple_slice::<100>(byte_str_slice.into())
.map(|s| Self {
store: s.as_bytes().to_vec(),
})
}
}

impl<'a, K, S> TryFrom<&'a LiteMap<K, usize, S>> for ZeroTrie<Vec<u8>>
impl<'a, 'b, K, S> TryFrom<&'a LiteMap<K, usize, S>> for ZeroTrie<Vec<u8>>
where
// Borrow, not AsRef, because we rely on Ord being the same. Unfortunately
// this means `LiteMap<&str, usize>` does not work.
K: Borrow<[u8]>,
K: Borrow<ByteStr>,
S: litemap::store::StoreSlice<K, usize, Slice = [(K, usize)]>,
{
type Error = ZeroTrieBuildError;
fn try_from(items: &LiteMap<K, usize, S>) -> Result<Self, ZeroTrieBuildError> {
let byte_litemap = items.to_borrowed_keys::<[u8], Vec<_>>();
let byte_slice = byte_litemap.as_slice();
let byte_str_slice = ByteStr::from_byte_slice_with_value(byte_slice);
let byte_litemap = items.to_borrowed_keys::<ByteStr, Vec<_>>();
let byte_str_slice = byte_litemap.as_slice();
Self::try_from_tuple_slice(byte_str_slice)
}
}
Expand Down
2 changes: 1 addition & 1 deletion utils/zerotrie/src/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ mod litemap;
#[cfg(feature = "alloc")]
pub(crate) mod nonconst;

use bytestr::ByteStr;
pub use bytestr::ByteStr;

use super::ZeroTrieSimpleAscii;

Expand Down
55 changes: 16 additions & 39 deletions utils/zerotrie/src/builder/nonconst/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ use crate::byte_phf::PerfectByteHashMapCacheOwned;
use crate::error::ZeroTrieBuildError;
use crate::options::*;
use crate::varint;
use alloc::borrow::Cow;
use alloc::vec::Vec;

/// A low-level builder for ZeroTrie. Supports all options.
Expand Down Expand Up @@ -101,12 +100,11 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
let items = Vec::<(K, usize)>::from_iter(iter);
let mut items = items
.iter()
.map(|(k, v)| (k.as_ref(), *v))
.collect::<Vec<(&[u8], usize)>>();
items.sort_by(|a, b| cmp_keys_values(&options, *a, *b));
let ascii_str_slice = items.as_slice();
let byte_str_slice = ByteStr::from_byte_slice_with_value(ascii_str_slice);
Self::from_sorted_tuple_slice_impl(byte_str_slice, options)
.map(|(k, v)| (ByteStr::from_bytes(k.as_ref()), *v))
.collect::<Vec<(&ByteStr, usize)>>();
items.sort_by(|a, b| cmp_keys_values(*a, *b));
let byte_str_slice = items.as_slice();
Self::from_sorted_tuple_slice(byte_str_slice, options)
}

/// Builds a ZeroTrie with the given items and options. Assumes that the items are sorted,
Expand All @@ -118,29 +116,16 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
pub fn from_sorted_tuple_slice(
items: &[(&ByteStr, usize)],
options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
let mut items = Cow::Borrowed(items);
if matches!(options.case_sensitivity, CaseSensitivity::IgnoreCase) {
// We need to re-sort the items with our custom comparator.
items.to_mut().sort_by(|a, b| {
cmp_keys_values(&options, (a.0.as_bytes(), a.1), (b.0.as_bytes(), b.1))
});
}
Self::from_sorted_tuple_slice_impl(&items, options)
}

/// Internal constructor that does not re-sort the items.
fn from_sorted_tuple_slice_impl(
items: &[(&ByteStr, usize)],
options: ZeroTrieBuilderOptions,
) -> Result<Self, ZeroTrieBuildError> {
for ab in items.windows(2) {
debug_assert!(cmp_keys_values(
&options,
(ab[0].0.as_bytes(), ab[0].1),
(ab[1].0.as_bytes(), ab[1].1)
)
.is_lt());
debug_assert!(
cmp_keys_values(
(&ab[0].0, ab[0].1),
(&ab[1].0, ab[1].1)
)
.is_lt(),
"{ab:?}"
);
}
let mut result = Self {
data: S::atbs_new_empty(),
Expand Down Expand Up @@ -403,16 +388,8 @@ impl<S: TrieBuilderStore> ZeroTrieBuilder<S> {
}

fn cmp_keys_values(
options: &ZeroTrieBuilderOptions,
a: (&[u8], usize),
b: (&[u8], usize),
a: (&ByteStr, usize),
b: (&ByteStr, usize),
) -> Ordering {
if matches!(options.case_sensitivity, CaseSensitivity::Sensitive) {
a.0.cmp(b.0)
} else {
let a_iter = a.0.iter().map(|x| x.to_ascii_lowercase());
let b_iter = b.0.iter().map(|x| x.to_ascii_lowercase());
Iterator::cmp(a_iter, b_iter)
}
.then_with(|| a.1.cmp(&b.1))
a.0.cmp(b.0).then_with(|| a.1.cmp(&b.1))
}
Loading