Skip to content

Commit

Permalink
status: add fallback to slow crawl on watchman failure
Browse files Browse the repository at this point in the history
Summary:
The Python "status" code would fall back to non-Watchman status on a Watchman error by default. Previously I didn't reproduce this in the Rust status, hoping it wasn't necessary. However, various cases have popped up where Watchman can't be used in various environments for various reasons, so let's add the fallback to Rust as well.

Note that some of the Watchman errors have been real bugs in Watchman (e.g. startup race condition in Windows), and potential "environment" bugs (e.g. Watchman simply not installed), but it does seem like we should err on the side of not blocking users with errors. I'm going to disable fsmonitor.fallback-on-watchman-exception for tests and local development so we don't hide new issues.

Reviewed By: quark-zju

Differential Revision: D51265962

fbshipit-source-id: d66dd47e83f4bd3811f04d237a75148030cc8526
  • Loading branch information
muirdm authored and facebook-github-bot committed Nov 13, 2023
1 parent 995db0d commit 315f9e3
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 46 deletions.
146 changes: 100 additions & 46 deletions eden/scm/lib/workingcopy/src/watchmanfs/watchmanfs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,52 +156,7 @@ impl WatchmanFileSystem {

Ok(result)
}
}

async fn crawl_progress(root: PathBuf, approx_file_count: u64) -> Result<()> {
let client = {
let _bar = ProgressBar::new_detached("connecting watchman", 0, "");

// If watchman just started (and we issued "watch-project" from
// query_files), this connect gets stuck indefinitely. Work around by
// timing out and retrying until we get through.
loop {
match tokio::time::timeout(Duration::from_secs(1), Connector::new().connect()).await {
Ok(client) => break client?,
Err(_) => {}
};

tokio::time::sleep(Duration::from_secs(1)).await;
}
};

let mut bar = None;

let req = DebugRootStatusRequest(
"debug-root-status",
CanonicalPath::canonicalize(root)?.into_path_buf(),
);

loop {
let response: DebugRootStatusResponse = client.generic_request(req.clone()).await?;

if let Some(RootStatus {
recrawl_info: Some(RecrawlInfo { stats: Some(stats) }),
}) = response.root_status
{
bar.get_or_insert_with(|| {
ProgressBar::new_detached("crawling", approx_file_count, "files (approx)")
})
.set_position(stats);
} else if bar.is_some() {
return Ok(());
}

tokio::time::sleep(Duration::from_millis(100)).await;
}
}

impl FileSystem for WatchmanFileSystem {
#[tracing::instrument(skip_all)]
fn pending_changes(
&self,
Expand Down Expand Up @@ -265,11 +220,14 @@ impl FileSystem for WatchmanFileSystem {
})?,
},
ignore_dirs,
))?
))
};

// Make sure we always abort - even in case of error.
progress_handle.abort();

let result = result?;

tracing::debug!(
target: "watchman_info",
watchmanfreshinstances= if result.is_fresh_instance { 1 } else { 0 },
Expand Down Expand Up @@ -397,6 +355,102 @@ impl FileSystem for WatchmanFileSystem {

Ok(Box::new(pending_changes.into_iter()))
}
}

async fn crawl_progress(root: PathBuf, approx_file_count: u64) -> Result<()> {
let client = {
let _bar = ProgressBar::new_detached("connecting watchman", 0, "");

// If watchman just started (and we issued "watch-project" from
// query_files), this connect gets stuck indefinitely. Work around by
// timing out and retrying until we get through.
loop {
match tokio::time::timeout(Duration::from_secs(1), Connector::new().connect()).await {
Ok(client) => break client?,
Err(_) => {}
};

tokio::time::sleep(Duration::from_secs(1)).await;
}
};

let mut bar = None;

let req = DebugRootStatusRequest(
"debug-root-status",
CanonicalPath::canonicalize(root)?.into_path_buf(),
);

loop {
let response: DebugRootStatusResponse = client.generic_request(req.clone()).await?;

if let Some(RootStatus {
recrawl_info: Some(RecrawlInfo { stats: Some(stats) }),
}) = response.root_status
{
bar.get_or_insert_with(|| {
ProgressBar::new_detached("crawling", approx_file_count, "files (approx)")
})
.set_position(stats);
} else if bar.is_some() {
return Ok(());
}

tokio::time::sleep(Duration::from_millis(100)).await;
}
}

impl FileSystem for WatchmanFileSystem {
fn pending_changes(
&self,
matcher: DynMatcher,
ignore_matcher: DynMatcher,
ignore_dirs: Vec<PathBuf>,
include_ignored: bool,
last_write: SystemTime,
config: &dyn Config,
io: &IO,
) -> Result<Box<dyn Iterator<Item = Result<PendingChange>>>> {
let result = self.pending_changes(
matcher.clone(),
ignore_matcher.clone(),
ignore_dirs.clone(),
include_ignored,
last_write,
config,
io,
);

match result {
Ok(result) => Ok(result),
Err(err) if err.is::<watchman_client::Error>() => {
if !config.get_or("fsmonitor", "fallback-on-watchman-exception", || true)? {
return Err(err);
}

// On watchman error, fall back to manual walk. This is important for errors such as:
// - "watchman" binary not in PATH
// - unsupported filesystem (e.g. NFS)
//
// A better approach might be an allowlist of errors to fall
// back on so we can fail hard in cases where watchman "should"
// work, but that is probably still an unacceptable UX in general.

tracing::debug!(target: "watchman_info", watchmanfallback=1);
tracing::warn!(?err, "watchman error - falling back to slow crawl");
self.inner.pending_changes(
matcher,
ignore_matcher,
ignore_dirs,
include_ignored,
last_write,
config,
io,
)
}
Err(err) => Err(err),
}
}

fn sparse_matcher(
&self,
Expand Down
24 changes: 24 additions & 0 deletions eden/scm/tests/test-fsmonitor-fallback.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#debugruntest-compatible
#require fsmonitor

$ configure modernclient
$ newclientrepo
$ echo foo > foo
$ hg commit -Aqm foo

$ echo nope > $TESTTMP/watchman
$ chmod +x $TESTTMP/watchman
$ export PATH=$TESTTMP:$PATH
$ unset WATCHMAN_SOCK

$ echo foo >> foo
$ LOG=warn,watchman_info=debug hg st --config fsmonitor.fallback-on-watchman-exception=true
DEBUG watchman_info: watchmanfallback=1
WARN workingcopy::watchmanfs::watchmanfs: watchman error - falling back to slow crawl * (glob)
` (?)
M foo

$ LOG=warn,watchman_info=debug hg st --config fsmonitor.fallback-on-watchman-exception=false
abort: While invoking the watchman CLI * (glob)
` (?)
[255]

0 comments on commit 315f9e3

Please sign in to comment.