diff --git a/Cargo.toml b/Cargo.toml index 7c0c620..24e4db7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [profile.release] -# debug = true +debug = false lto = "thin" +opt-level = "s" [workspace] diff --git a/editor-support/vscode/syntaxes/lopez-crawl-directive.rion b/editor-support/vscode/syntaxes/lopez-crawl-directive.rion index aa13d95..86f80ba 100644 --- a/editor-support/vscode/syntaxes/lopez-crawl-directive.rion +++ b/editor-support/vscode/syntaxes/lopez-crawl-directive.rion @@ -263,24 +263,32 @@ keywords : context { ruleset_namespace : context { : pattern { - regex \= (first|collect|count|sum) + regex \= (first|collect|distinct|count|sum|group) styles[] = .ruleset_aggregator; } : pattern { - regex \= (name|text|html|inner-html|attr) + regex \= (name|text|html|inner-html|attrs?|classes|id|parent|children|select-any|select-all) styles[] = .ruleset_extractor; } : pattern { - regex \= (is-null|is-not-null|length|hash|get|flatten|each|capture|all-captures) + regex \= (is-null|is-not-null|hash|not|as-number|greater-than|lesser-than|equals|length|is-empty|get|flatten|each|filter|pretty|capture|all-captures|matches|replace|with) styles[] = .ruleset_transformer; } + : pattern { + regex \= (\!explode) + styles[] = .keyword; + } } numeric : context { : pattern { - regex \= (\b\d+) + regex \= (\b[0-9_]+e?[+-]\d+) styles [] = .numeric; } + : pattern { + regex \= (true|false) + styles[] = .keyword; + } } } diff --git a/editor-support/vscode/syntaxes/lopez-crawl-directives.tmLanguage.plist b/editor-support/vscode/syntaxes/lopez-crawl-directives.tmLanguage.plist index ea7ba91..623fe07 100644 --- a/editor-support/vscode/syntaxes/lopez-crawl-directives.tmLanguage.plist +++ b/editor-support/vscode/syntaxes/lopez-crawl-directives.tmLanguage.plist @@ -268,10 +268,6 @@ include #string - - include - #numeric - main__4 @@ -286,10 +282,16 @@ match - (\b\d+) + (\b[0-9_]+e?[\x{002b}-]\d+) name constant.numeric.lcd + + match + (true|false) + name + keyword.lcd + punctuation @@ -318,11 +320,11 @@ match (name|text|html|inner-html|attrs?|classes|id|parent|children|select-any|select-all) name - variable.lcd + entity.name.function.lcd match - (is-null|is-not-null|hash|not|as-number|greater-than|lesser-than|equals|length|get|flatten|each|filter|pretty|capture|all-captures|replace) + (is-null|is-not-null|hash|not|as-number|greater-than|lesser-than|equals|length|is-empty|get|flatten|each|filter|pretty|capture|all-captures|matches|replace|with) name entity.name.function.lcd diff --git a/entalator/src/main.rs b/entalator/src/main.rs index 77b5f3e..a28a28b 100644 --- a/entalator/src/main.rs +++ b/entalator/src/main.rs @@ -6,15 +6,18 @@ use std::{env, fs, io}; const LOPEZ_BIN: &[u8] = include_bytes!("../../target/release/lopez"); const LOPEZ_LIB: Dir = include_dir::include_dir!("../std-lopez"); +const LIB_PATH: &str = "/usr/share/lopez/lib"; +const BIN_PATH: &str = "/usr/local/bin/lopez"; + fn install() -> io::Result<()> { - let lib_path: PathBuf = "/usr/share/lopez/lib".parse().expect("infallible"); - println!("Installing `lopez` to `/usr/local/bin`"); + println!("Installing `lopez` to `{}`", BIN_PATH); - fs::write("/usr/local/bin/lopez", LOPEZ_BIN)?; - fs::set_permissions("/usr/local/bin/lopez", fs::Permissions::from_mode(0o711))?; + fs::write(BIN_PATH, LOPEZ_BIN)?; + fs::set_permissions(BIN_PATH, fs::Permissions::from_mode(0o711))?; - println!("Installing `std-lopez` to `usr/share/lopez`"); + let lib_path: PathBuf = LIB_PATH.parse().expect("infallible"); + println!("Installing `std-lopez` to `{}`", LIB_PATH); println!("Creating folder structure"); diff --git a/lib-lopez/src/crawler/counter.rs b/lib-lopez/src/crawler/counter.rs index d249450..5d38fa6 100644 --- a/lib-lopez/src/crawler/counter.rs +++ b/lib-lopez/src/crawler/counter.rs @@ -123,7 +123,8 @@ impl StatsTracker { self.quota as usize, ), hit_rate: Human( - (self.already_done + self.counter.n_closed() - self.counter.n_error() + (self.already_done + self.counter.n_closed() + - self.counter.n_error() - self .last .as_ref() @@ -216,3 +217,28 @@ impl Display for Stats { Ok(()) } } + +struct Smoother { + last_state: f64, + last_variance: f64, + state_variance: f64, + output_variance: f64, +} + +fn par(a: f64, b: f64) -> f64 { + a * b / (a + b) +} + +impl Smoother { + fn smooth(&mut self, input: f64) -> f64 { + let variance = self.last_variance + self.state_variance; + let new_state = self.last_state + + variance / (variance + self.output_variance) * (input - self.last_state); + let new_variance = par(variance, self.output_variance); + + self.last_state = new_state; + self.last_variance = new_variance; + + new_state + } +} diff --git a/lib-lopez/src/crawler/worker.rs b/lib-lopez/src/crawler/worker.rs index 2038700..e0dbbc7 100644 --- a/lib-lopez/src/crawler/worker.rs +++ b/lib-lopez/src/crawler/worker.rs @@ -397,7 +397,7 @@ impl CrawlWorker { .ensure_error(page_url) .await .map_err(|err| err.into())?; - + // This needs to be the last thing (because of `?`). self.task_counter.register_error(); } @@ -407,7 +407,7 @@ impl CrawlWorker { .ensure_error(page_url) .await .map_err(|err| err.into())?; - + // This needs to be the last thing (because of `?`). self.task_counter.register_error(); } @@ -480,7 +480,7 @@ impl CrawlWorker { // Register close, no matter the status. worker_ref.task_counter.register_closed(); - + // Now, analyze results: if let Err(error) = result { worker_ref.task_counter.register_error(); diff --git a/lib-lopez/src/directives/mod.rs b/lib-lopez/src/directives/mod.rs index ae88611..76af3ec 100644 --- a/lib-lopez/src/directives/mod.rs +++ b/lib-lopez/src/directives/mod.rs @@ -1,6 +1,7 @@ mod aggregator; mod extractor; mod parse; +mod parse_utils; mod transformer; mod value_ext; mod variable; @@ -49,13 +50,18 @@ fn load_items_from<'a, P: AsRef>( module_name: &str, paths: &'a [P], ) -> Result<(&'a P, Vec), String> { + let formatted_module_name = if module_name.is_empty() { + "
" + } else { + module_name + }; + let (path, module_str) = read_from_many(paths) - .map_err(|err| format!("could not open module `{}`: {}", module_name, err))?; + .map_err(|err| format!("could not open module `{}`: {}", formatted_module_name, err))?; let module = parse::entrypoint(&module_str) - .map_err(|err| format!("failed to parse `{}`: {}", module_name, err))? - .1 - .map_err(|err| format!("failed to interpret `{}`: {}", module_name, err))?; + .map_err(|err| format!("failed to parse `{}`: {}", formatted_module_name, err))? + .map_err(|err| format!("failed to interpret `{}`: {}", formatted_module_name, err))?; Ok((path, module)) } @@ -295,20 +301,20 @@ impl Directives { let duplicates = self.find_duplicate_rules(); if !duplicates.is_empty() { issues.push(format!( - "There are duplicated rules in directives: \n\t- {}", - duplicates.into_iter().collect::>().join("\n\t- ") + "There are duplicated rules in directives: \n {}", + duplicates.into_iter().collect::>().join("\n ") )); } let invalid_seeds = self.find_invalid_seeds(); if !invalid_seeds.is_empty() { issues.push(format!( - "There are seeds on the frontier or outside your boundaries: \n\t- {}", + "There are seeds on the frontier or outside your boundaries: \n {}", invalid_seeds .into_iter() .map(|url| url.as_str().to_owned()) .collect::>() - .join("\n\nt- ") + .join("\n ") )); } @@ -316,8 +322,8 @@ impl Directives { if !invalid.is_empty() { issues.push(format!( "There are invalid set-variable definitions \ - (these name are not known): \n\t- {}", - invalid.into_iter().collect::>().join("\n\t- "), + (these name are not known): \n {}", + invalid.into_iter().collect::>().join("\n "), )); } @@ -325,40 +331,43 @@ impl Directives { if !duplicates.is_empty() { issues.push(format!( "There are duplicate set-variable definitions \ - (these definitions are global): \n\t- {}", - duplicates.into_iter().collect::>().join("\n\t- "), + (these definitions are global): \n {}", + duplicates.into_iter().collect::>().join("\n "), )); } let bad_values = self.find_bad_set_variable_values(); if !bad_values.is_empty() { issues.push(format!( - "There are bad values for set-variables: \n\t- {}", + "There are bad values for set-variables: \n {}", bad_values .into_iter() .map(|err| err.to_string()) .collect::>() - .join("\n\nt- "), + .join("\n "), )) } let type_errors = self.find_type_errors(); if !type_errors.is_empty() { issues.push(format!( - "There are type errors for these rules: \n\t- {}", + "There are type errors for these rules: \n {}", type_errors .into_iter() .map(|(name, err)| format!("{}: {}", name, err)) .collect::>() - .join("\n\t- ") + .join("\n ") )) } if !issues.is_empty() { - return Err(issues.join("\n")); + Err(format!( + "There are issues with your configuration: \n{}", + issues.join("\n") + )) + } else { + Ok(()) } - - Ok(()) } /// Loads directives from a given file while also loading all dependencies. diff --git a/lib-lopez/src/directives/parse.rs b/lib-lopez/src/directives/parse.rs index 954c245..50414b7 100644 --- a/lib-lopez/src/directives/parse.rs +++ b/lib-lopez/src/directives/parse.rs @@ -14,6 +14,7 @@ use std::str::FromStr; use url::Url; use super::*; +use super::parse_utils::ParseError; /// Defines end of file (lol!): fn eof(i: &str) -> IResult<&str, ()> { @@ -839,6 +840,8 @@ fn boundary_test() { fn literal(i: &str) -> IResult<&str, Value> { alt(( map(escaped_string, Value::String), + map(tag("true"), |_| true.into()), + map(tag("false"), |_| false.into()), map_res(tuple((digit1, not(tag(".")))), |(number, _): (&str, ())| { number.parse::().map(|num| num.into()) }), @@ -967,11 +970,11 @@ fn item_test() { // )); } -pub fn entrypoint(i: &str) -> IResult<&str, Result, String>> { - all_consuming(map( +pub fn entrypoint(i: &str) -> Result, String>, ParseError> { + ParseError::map_iresult(i, all_consuming(map( tuple((whitespace, many0(trailing_whitespace(item)))), |(_, results)| results.into_iter().collect::, _>>(), - ))(i) + ))(i)) } #[test] @@ -980,6 +983,5 @@ fn entrypoint_test() { "select * { } set foo = \"bar\"; allow \"foo\";\n" )) .unwrap() - .1 .unwrap(); } diff --git a/lib-lopez/src/directives/parse_utils.rs b/lib-lopez/src/directives/parse_utils.rs new file mode 100644 index 0000000..854affe --- /dev/null +++ b/lib-lopez/src/directives/parse_utils.rs @@ -0,0 +1,69 @@ +use nom::error::ErrorKind; +use nom::IResult; +use std::fmt; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct Position { + line: usize, + column: usize, +} + +impl fmt::Display for Position { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "line {}, column {}", self.line + 1, self.column + 1) + } +} + +impl Position { + fn of(text: &str, fragment: &str) -> Position { + let fragment_pos = text.len() - fragment.len(); + let mut line = 0; + let mut column = 0; + + for ch in text[..fragment_pos].chars() { + if ch == '\n' { + line += 1; + column = 0; + } else if ch != '\r' { + column += 1; + } + } + + Position { line, column } + } +} + +#[derive(Debug)] +pub struct ParseError { + position: Position, + hint: String, + message: String, +} + +impl fmt::Display for ParseError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "at {} ({:?}): {}", self.position, self.hint, self.message) + } +} + +impl ParseError { + pub fn new(text: &str, err: nom::Err<(&str, ErrorKind)>) -> ParseError { + match err { + nom::Err::Error((fragment, error_kind)) | nom::Err::Failure((fragment, error_kind)) => { + ParseError { + position: Position::of(text, fragment), + hint: fragment.lines().map(str::to_owned).next().unwrap_or_default().chars().take(10).collect::() + "...", + message: error_kind.description().to_owned(), + } + } + nom::Err::Incomplete(_) => panic!("incomplete variant no accepted"), + } + } + + pub fn map_iresult(text: &str, iresult: IResult<&str, T>) -> Result { + match iresult { + Ok((_left_over, result)) => Ok(result), + Err(err) => Err(ParseError::new(text, err)), + } + } +} diff --git a/lib-lopez/src/error.rs b/lib-lopez/src/error.rs index 7b3dd56..97f011e 100644 --- a/lib-lopez/src/error.rs +++ b/lib-lopez/src/error.rs @@ -19,7 +19,7 @@ pub enum Error { UnknownContentEncoding(String), #[fail(display = "timed out")] Timeout, - #[fail(display = "bad set-variable value for {}: {:?}", _0, _1)] + #[fail(display = "bad set-variable value for {}: {}", _0, _1)] BadSetVariableValue(crate::directives::Variable, serde_json::Value), #[fail(display = "type error: no type for `{}` of `{}`", _0, _1)] TypeError(String, crate::directives::Type), diff --git a/lib-lopez/src/lib.rs b/lib-lopez/src/lib.rs index a97f330..69ef966 100644 --- a/lib-lopez/src/lib.rs +++ b/lib-lopez/src/lib.rs @@ -55,29 +55,41 @@ macro_rules! main { $crate::cli_impl!($backend_ty); #[tokio::main(basic_scheduler)] - async fn main() -> Result<(), $crate::Error> { + pub async fn main() -> Result<(), $crate::Error> { + use $crate::ansi_term::Color::{Green, Red}; + + match run().await { + Ok(Some(msg)) => println!("{}: {}", Green.bold().paint("ok"), msg), + Ok(None) => {} + Err(err) => println!("{}: {}", Red.bold().paint("error"), err), + } + + Ok(()) + } + + async fn run() -> Result, $crate::Error> { use std::sync::Arc; - use $crate::ansi_term::Color::{Green, Red}; + use $crate::ansi_term::Color::Red; use $crate::backend::Url; use $crate::Directives; + #[cfg(windows)] + let enabled = colored_json::enable_ansi_support(); + // Environment interpretation: let cli = Cli::from_args(); match cli.app { LopezApp::Validate { source } => { // Open directives: - match Directives::load(source, cli.import_path) { - Ok(_directives) => { - println!("{}", Green.bold().paint("Valid configuration")) - } - Err(err) => println!("{}: {}", Red.bold().paint("Error"), err), - } + Directives::load(source, cli.import_path) + .map(|_| Some("valid configuration".to_owned())) + .map_err(|err| err.into()) } LopezApp::Test { source, test_url } => { match Url::parse(&test_url) { - Err(err) => println!("{}: {}", Red.bold().paint("Invalid URL"), err,), + Err(err) => Err(err.into()), Ok(url) => { // Open directives: let directives = Arc::new(Directives::load(source, cli.import_path)?); @@ -89,6 +101,8 @@ macro_rules! main { // Show report (TODO bad representation! make something pretty): report.pretty_print(); + + Ok(None) } } } @@ -109,6 +123,8 @@ macro_rules! main { // Do the thing! $crate::start(Arc::new(profile), directives, backend).await?; + + Ok(Some("crawl complete".to_owned())) } LopezApp::PageRank { wave_name, config } => { // Init logging: @@ -119,16 +135,17 @@ macro_rules! main { // Do the thing. $crate::page_rank(backend).await?; + + Ok(Some("page rank done".to_owned())) } } - - Ok(()) } }; } /// A dummy module only to validate the expansion of the [`main!`] macro /// against the dummy backend. +#[allow(unused)] mod dummy { main! { crate::backend::DummyBackend } }