diff --git a/src/tokenizer/chain.rs b/src/tokenizer/chain.rs
new file mode 100644
index 0000000000..f52a912296
--- /dev/null
+++ b/src/tokenizer/chain.rs
@@ -0,0 +1,147 @@
+use tokenizer_api::{Token, TokenStream, Tokenizer};
+
+/// A tokenizer running through the first tokenizer and then through the second.
+#[derive(Clone)]
+pub struct ChainTokenizer<F, S> {
+    first: F,
+    second: S,
+}
+
+impl<F, S> ChainTokenizer<F, S>
+where
+    F: Tokenizer,
+    S: Tokenizer,
+{
+    /// Create a new tokenizer, chaining the two provided ones.
+    pub fn new(first: F, second: S) -> Self {
+        Self { first, second }
+    }
+}
+
+impl<F, S> Tokenizer for ChainTokenizer<F, S>
+where
+    F: Tokenizer,
+    S: Tokenizer,
+{
+    type TokenStream<'a> = ChainTokenStream<'a, F, S>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        ChainTokenStream {
+            first: Some(self.first.token_stream(text)),
+            second: self.second.token_stream(text),
+        }
+    }
+}
+
+pub struct ChainTokenStream<'a, F, S>
+where
+    F: Tokenizer,
+    S: Tokenizer,
+{
+    first: Option<F::TokenStream<'a>>,
+    second: S::TokenStream<'a>,
+}
+
+impl<'a, F, S> TokenStream for ChainTokenStream<'a, F, S>
+where
+    F: Tokenizer,
+    S: Tokenizer,
+{
+    fn advance(&mut self) -> bool {
+        if let Some(first) = &mut self.first {
+            if first.advance() {
+                return true;
+            } else {
+                self.first = None;
+            }
+        }
+
+        self.second.advance()
+    }
+
+    fn token(&self) -> &Token {
+        match &self.first {
+            Some(first) => first.token(),
+            None => self.second.token(),
+        }
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        match &mut self.first {
+            Some(first) => first.token_mut(),
+            None => self.second.token_mut(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use tokenizer_api::TokenFilter;
+
+    use super::*;
+    use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
+    use crate::tokenizer::{LowerCaser, RawTokenizer, SimpleTokenizer, TokenizerExt};
+
+    fn assert_chain<'a>(
+        first: impl Tokenizer,
+        second: impl Tokenizer,
+        input: &str,
+        expected: impl IntoIterator<Item = &'a str>,
+    ) {
+        let mut chain = ChainTokenizer::new(first, second);
+        let mut stream = chain.token_stream(input);
+        let mut result = vec![];
+        while let Some(token) = stream.next() {
+            result.push(token.text.to_string());
+        }
+        let expected = expected.into_iter().collect::<Vec<_>>();
+        assert_eq!(expected, result);
+    }
+
+    #[test]
+    fn test_empty() {
+        assert_chain(EmptyTokenizer, EmptyTokenizer, "", []);
+    }
+
+    #[test]
+    fn test_simple() {
+        assert_chain(
+            SimpleTokenizer::default(),
+            LowerCaser.transform(SimpleTokenizer::default()),
+            "Foo Bar Baz",
+            ["Foo", "Bar", "Baz", "foo", "bar", "baz"],
+        );
+    }
+
+    #[test]
+    fn test_empty_simple() {
+        assert_chain(
+            EmptyTokenizer,
+            SimpleTokenizer::default(),
+            "Foo Bar Baz",
+            ["Foo", "Bar", "Baz"],
+        );
+    }
+
+    #[test]
+    fn test_simple_empty() {
+        assert_chain(
+            SimpleTokenizer::default(),
+            EmptyTokenizer,
+            "Foo Bar Baz",
+            ["Foo", "Bar", "Baz"],
+        );
+    }
+
+    #[test]
+    fn test_chain_twice() {
+        assert_chain(
+            SimpleTokenizer::default(),
+            LowerCaser
+                .transform(SimpleTokenizer::default())
+                .chain(RawTokenizer::default()),
+            "FOO BAR BAZ",
+            ["FOO", "BAR", "BAZ", "foo", "bar", "baz", "FOO BAR BAZ"],
+        );
+    }
+}
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 23f7893d29..d272cdf6c4 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -123,6 +123,7 @@
 //! ```
 mod alphanum_only;
 mod ascii_folding_filter;
+mod chain;
 mod empty_tokenizer;
 mod facet_tokenizer;
 mod lower_caser;
@@ -143,6 +144,7 @@ pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokeniz
 
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::ascii_folding_filter::AsciiFoldingFilter;
+pub use self::chain::ChainTokenizer;
 pub use self::facet_tokenizer::FacetTokenizer;
 pub use self::lower_caser::LowerCaser;
 pub use self::ngram_tokenizer::NgramTokenizer;
@@ -165,6 +167,18 @@ pub use self::whitespace_tokenizer::WhitespaceTokenizer;
 /// `2^16 - 1 - 5`, the token will simply be ignored downstream.
 pub const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
 
+/// A trait to extend [`Tokenizer`]s with additional functionality.
+pub trait TokenizerExt: Sized {
+    /// Produce a [`Tokenizer`] which runs through the first tokenizer, and then through the second.
+    fn chain<T: Tokenizer>(self, next: T) -> ChainTokenizer<Self, T>;
+}
+
+impl<F: Tokenizer> TokenizerExt for F {
+    fn chain<T: Tokenizer>(self, second: T) -> ChainTokenizer<Self, T> {
+        ChainTokenizer::new(self, second)
+    }
+}
+
 #[cfg(test)]
 pub mod tests {
     use super::{
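
Not part of the diff, but as a usage sketch: the snippet below shows how the new `ChainTokenizer` / `TokenizerExt::chain` combinator could be driven from user code, assuming the re-exports added to `src/tokenizer/mod.rs` above. The particular tokenizer combination (a plain `SimpleTokenizer` chained with a lowercasing pipeline) is only an illustration, mirroring the diff's own tests.

```rust
// Illustrative sketch: chain a plain SimpleTokenizer with a lowercasing
// pipeline, so the same text yields both original-case and lowercased tokens.
use tantivy::tokenizer::{
    LowerCaser, SimpleTokenizer, TokenFilter, TokenStream, Tokenizer, TokenizerExt,
};

fn main() {
    // `chain` is the extension-trait shorthand for `ChainTokenizer::new(first, second)`.
    let mut chained = SimpleTokenizer::default()
        .chain(LowerCaser.transform(SimpleTokenizer::default()));

    let mut stream = chained.token_stream("Foo Bar");
    let mut tokens = Vec::new();
    while let Some(token) = stream.next() {
        tokens.push(token.text.clone());
    }

    // All tokens from the first tokenizer are emitted before any from the second.
    assert_eq!(tokens, ["Foo", "Bar", "foo", "bar"]);
}
```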