Skip to content

Commit

Permalink
Add check for chef last run age
Browse files Browse the repository at this point in the history
Summary:
This diff is part 6 of the stacked changes to introduce a new health-report command in Eden. Specifically, this diff focuses on adding checks to determine *if chef is running properly on user machine**.

This is how we have the contents of chef.last.run_stats across different platforms -
1. Mac
Path: /var/chef/outputs/chef.last_run_stats
Content:
```
{
  "chef.run_success_timestamp": 1643723400,
  "chef.os_success": true,
  "chef.run_success": true,
  "chef.run_start_time": 1643723395,
  "chef.run_end_time": 1643723400,
  "chef.run_elapsed_time": 5,
  "chef.run_updated_resources_count": 10,
  "chef.revision": "1234567890abcdef",
  "chef.destructive_normal": false,
  "chef.destructive_warn": false,
  "chef.destructive_error": false,
  "chef.exception_class": null,
  "chef.exception": null
}
```
2. Linux
Path: /var/chef/outputs/chef.last_run_stats
Content:
```
{
  "chef.run_success_timestamp": 1732645869,
  "chef.os_success": 1,
  "chef.run_success": 1,
  "chef.revision": "f6a94fb83b7790bb5b2cea31747647ff766e5631",
  "chef.run_timestamp": 1732645869,
  "chef.run_start_time": 1732645646,
  "chef.run_end_time": 1732645869,
  "chef.run_elapsed_time": 223,
  "chef.run_updated_resources_count": 92,
  "chef.shard": 97,
  "tags": {
    "chef.org": "o3"
  }
}
```

3. Windows
Path: C:\chef\outputs\chef.last_run_stats
Content:
```
{
  "chef.run_success_timestamp": 1732643948,
  "chef.last_failure_time": 1732644453,
  "chef.last_success_time": 1732643948,
  "chef.run_success": false,
  "chef.run_timestamp": 1732644453,
  "chef.consecutive_failures": 1,
  "chef.run_start_time": 1732644188,
  "chef.run_end_time": 1732644453,
  "chef.run_elapsed_time": 264,
  "chef.run_updated_resources_count": 8
}
```
Across all platforms, the output file contents share a common attribute: chef.run_success_timestamp. This attribute consistently indicates the last Chef run time. Therefore, we will focus on checking this attribute across all platforms to detect any issues related to Chef runs.

### Background on Eden Health Check Subcommand
**Problem Statement**
Currently, when Eden runs within VSCode, we are unable to effectively surface Eden-related issues/failures to users who are unaware that Eden is not healthy on their machine. This can lead to users being unable to use Eden-dependent services like Myles and Sapling.

**Solution**
To address this issue, we propose creating a new subcommand that performs critical checks related to Eden health. This subcommand will be lightweight and can be automated via VSCode periodic runs.

**Critical Checks**
The following critical checks should be included in the subcommand:
- Eden Daemon Not Running: Check if the Eden daemon is running using eden run.
- No Repo Mount Found: Check if any repositories are mounted and if the user is actively developing using Eden-checked out repos.
- Corp/Prod Canal Certificates Missing/Expired: Check if Corp/Prod canal certificates are missing or expired, which would prevent Eden from operating.
- Eden Stale Version: Check if the Eden version is older than 28 days from the current deployed or installed version.
- Last Chef Age/Run: Check if Chef has run recently (ideally every 15 minutes) to ensure that the latest Eden version is installed.

**Benefits**
By implementing this subcommand, we can:
- Surface Eden-related issues/failures to users more effectively
- Improve user engagement with Eden-dependent services like Myles and Sapling
- Reduce the likelihood of users blaming other systems for issues caused by Eden
- Ensure that users have the latest version of Eden installed and running on their machines

Reviewed By: jdelliot

Differential Revision: D66439133

fbshipit-source-id: 63d43079a0860daa79451d2f61a4403d6e7fe387
  • Loading branch information
Vini Gupta authored and facebook-github-bot committed Dec 2, 2024
1 parent b09fef7 commit a2dae52
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 14 deletions.
56 changes: 54 additions & 2 deletions eden/fs/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,19 @@
import sys
import traceback
import typing
from datetime import datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple, Type

from eden.fs.cli.util import get_chef_log_path

# Constants
CHEF_LOG_TIMESTAMP_KEY = "chef.run_success_timestamp"
# It is very common for users to get chef warnings on Mondays if this
# time period is shorter than 4 days. So, we report a problem after two weeks.
CHEF_RUN_AGE_PROBLEM = timedelta(days=14)

import thrift.transport
from eden.fs.cli.version import VersionInfo

Expand Down Expand Up @@ -1209,13 +1218,15 @@ class ErrorCode(Enum):
STALE_EDEN_VERSION = 2
INVALID_CERTS = 3
NO_REPO_MOUNT_FOUND = 4
CHEF_NOT_RUNNING = 5

def description(self) -> str:
descriptions = {
self.EDEN_NOT_RUNNING: "The EdenFS daemon is not running.",
self.STALE_EDEN_VERSION: "The running EdenFS daemon is over 30 days out-of-date.",
self.INVALID_CERTS: "The EdenFS couldn't find a valid user certificate.",
self.NO_REPO_MOUNT_FOUND: "One or more checkouts are unmounted: ",
self.CHEF_NOT_RUNNING: "Chef is not running on your machine.",
}
return descriptions[self]

Expand Down Expand Up @@ -1264,7 +1275,7 @@ def is_eden_up_to_date(self) -> bool:
self.error_codes[HealthReportCmd.ErrorCode.STALE_EDEN_VERSION] = (
"Running EdenFS version: "
+ (self.version_info.running_version or "")
+ " installed EdenFS version: "
+ ", installed EdenFS version: "
+ (self.version_info.installed_version or "")
)
return False
Expand Down Expand Up @@ -1299,6 +1310,46 @@ def is_repo_mounted(self, instance: EdenInstance, mounts: List[str]) -> bool:
print(f"Couldn't retrieve EdenFS checkouts info.: {ex}", file=sys.stderr)
return True

def is_chef_running(self) -> bool:
"""Examine the status of Chef."""
chef_log_path = get_chef_log_path(platform.system())
if chef_log_path is None or chef_log_path == "":
print(f"Skipping chef run check for platform {platform.system()}.")
return True
try:
with open(chef_log_path, "r") as f:
chef_log_raw = f.read()
chef_log = json.loads(chef_log_raw)
last_chef_run_sec = chef_log[CHEF_LOG_TIMESTAMP_KEY]

if not isinstance(last_chef_run_sec, (int, float)):
raise ValueError(
f"Invalid/missing timestamp in {CHEF_LOG_TIMESTAMP_KEY}"
)

last_chef_run = datetime.fromtimestamp(last_chef_run_sec)

ms_since_last_run = (
datetime.now() - last_chef_run
).total_seconds() * 1000

if ms_since_last_run >= CHEF_RUN_AGE_PROBLEM.total_seconds() * 1000:
self.error_codes[HealthReportCmd.ErrorCode.CHEF_NOT_RUNNING] = (
"Last run was "
+ str((ms_since_last_run / 3600000))
+ " hours ago."
)
return False
return True
except Exception as e:
self.error_codes[HealthReportCmd.ErrorCode.CHEF_NOT_RUNNING] = (
"Failed to load chef log at "
+ str(chef_log_path)
+ " with error: "
+ str(e.args[0])
)
return False

@staticmethod
def print_error_codes_json(out: ui.Output) -> None:
"""
Expand All @@ -1313,7 +1364,7 @@ def print_error_codes_json(out: ui.Output) -> None:
data = [
{
"error": error_code.name,
"description": error_code.description() + "." + error_additional_info,
"description": error_code.description() + error_additional_info,
}
for error_code, error_additional_info in HealthReportCmd.error_codes.items()
]
Expand All @@ -1335,6 +1386,7 @@ def run(self, args: argparse.Namespace) -> int:
for f in [
self.is_eden_up_to_date,
self.are_certs_valid,
self.is_chef_running,
]
)
):
Expand Down
62 changes: 50 additions & 12 deletions eden/fs/cli/test/health_report_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
# pyre-strict

import argparse
import json
import os

import tempfile
import typing
import unittest
from datetime import datetime
Expand Down Expand Up @@ -71,7 +74,7 @@ def get_version_age(version_str: str) -> int:


class HealthReportTest(unittest.TestCase, TemporaryDirectoryMixin):
def setup(self) -> typing.Tuple[MagicMock, argparse.Namespace]:
def setup(self) -> typing.Tuple[MagicMock, argparse.Namespace, str]:
temp_dir = self.make_temporary_directory()
eden_path = os.path.join(temp_dir, "mount_dir")

Expand All @@ -88,8 +91,34 @@ def setup(self) -> typing.Tuple[MagicMock, argparse.Namespace]:
],
)
mock_argument_parser = MagicMock(spec=argparse.ArgumentParser)
return (mock_argument_parser, args)

# Define the JSON data
data = {
"chef.run_success_timestamp": 1732557634,
"chef.last_failure_time": 1732305759,
"chef.last_success_time": 1732557634,
"chef.run_success": True,
"chef.run_timestamp": 1732557634,
"chef.consecutive_failures": 0,
"chef.run_start_time": 1732547468,
"chef.run_end_time": 1732557624,
"chef.run_elapsed_time": 10155,
"chef.run_updated_resources_count": 27,
}

# Create a temporary file
fd, file_path = tempfile.mkstemp()
try:
# Open the file in write mode
with os.fdopen(fd, "w") as tmp_file:
# Write the JSON data to the file
json.dump(data, tmp_file)
except Exception as e:
print(f"An error occurred: {e}")

return (mock_argument_parser, args, file_path)

@patch("eden.fs.cli.util.get_chef_log_path")
@patch("eden.fs.cli.doctor.facebook.check_x509.find_x509_path")
@patch("eden.fs.cli.doctor.facebook.check_x509.validate_x509")
@patch("eden.fs.cli.config.EdenInstance.get_running_version")
Expand All @@ -102,8 +131,10 @@ def test_calling_into_health_report(
mock_get_running_version: MagicMock,
mock_validate_x509: MagicMock,
mock_find_x509_path: MagicMock,
mock_get_chef_log_path: MagicMock,
) -> None:
mock_argument_parser, args = self.setup()
mock_argument_parser, args, file_path = self.setup()
mock_get_chef_log_path.return_value = file_path
mock_get_running_version.return_value = latest_version
mock_get_version_info.return_value = latest_running_version_info
mock_is_healthy.return_value = True
Expand All @@ -112,15 +143,14 @@ def test_calling_into_health_report(

test_health_report_cmd = HealthReportCmd(mock_argument_parser)
result = test_health_report_cmd.run(args)
self.assertEqual(HealthReportCmd.error_codes, {})
self.assertEqual(result, 0)
self.assertIsNotNone(result)

@patch("eden.fs.cli.util.HealthStatus.is_healthy")
def test_health_report_notify_eden_not_running(
self,
mock_is_healthy: MagicMock,
) -> None:
mock_argument_parser, args = self.setup()
mock_argument_parser, args, file_path = self.setup()
mock_is_healthy.return_value = False

test_health_report_cmd = HealthReportCmd(mock_argument_parser)
Expand All @@ -134,6 +164,7 @@ def test_health_report_notify_eden_not_running(

self.assertEqual(result, 1)

@patch("eden.fs.cli.util.get_chef_log_path")
@patch("eden.fs.cli.doctor.facebook.check_x509.find_x509_path")
@patch("eden.fs.cli.doctor.facebook.check_x509.validate_x509")
@patch("eden.fs.cli.config.EdenInstance.get_running_version")
Expand All @@ -146,8 +177,10 @@ def test_health_report_check_for_stale_eden_version_prompt_error(
mock_get_running_version: MagicMock,
mock_validate_x509: MagicMock,
mock_find_x509_path: MagicMock,
mock_get_chef_log_path: MagicMock,
) -> None:
mock_argument_parser, args = self.setup()
mock_argument_parser, args, file_path = self.setup()
mock_get_chef_log_path.return_value = file_path
mock_get_running_version.return_value = stale_version
mock_get_version_info.return_value = stale_running_version_info
mock_is_healthy.return_value = True
Expand All @@ -159,11 +192,12 @@ def test_health_report_check_for_stale_eden_version_prompt_error(
self.assertEqual(
HealthReportCmd.error_codes,
{
HealthReportCmd.ErrorCode.STALE_EDEN_VERSION: "Running EdenFS version: 20240928-144752 installed EdenFS version: 20241030-165642"
HealthReportCmd.ErrorCode.STALE_EDEN_VERSION: "Running EdenFS version: 20240928-144752, installed EdenFS version: 20241030-165642"
},
)
self.assertEqual(result, 1)

@patch("eden.fs.cli.util.get_chef_log_path")
@patch("eden.fs.cli.doctor.facebook.check_x509.find_x509_path")
@patch("eden.fs.cli.doctor.facebook.check_x509.validate_x509")
@patch("eden.fs.cli.config.EdenInstance.get_running_version")
Expand All @@ -176,8 +210,10 @@ def test_health_report_check_for_stale_eden_version_no_error(
mock_get_running_version: MagicMock,
mock_validate_x509: MagicMock,
mock_find_x509_path: MagicMock,
mock_get_chef_log_path: MagicMock,
) -> None:
mock_argument_parser, args = self.setup()
mock_argument_parser, args, file_path = self.setup()
mock_get_chef_log_path.return_value = file_path
mock_get_running_version.return_value = acceptable_version
mock_get_version_info.return_value = acceptable_running_version_info
mock_is_healthy.return_value = True
Expand All @@ -186,9 +222,9 @@ def test_health_report_check_for_stale_eden_version_no_error(

test_health_report_cmd = HealthReportCmd(mock_argument_parser)
result = test_health_report_cmd.run(args)
self.assertEqual(HealthReportCmd.error_codes, {})
self.assertEqual(result, 0)
self.assertIsNotNone(result)

@patch("eden.fs.cli.util.get_chef_log_path")
@patch("eden.fs.cli.doctor.facebook.check_x509.find_x509_path")
@patch("eden.fs.cli.doctor.facebook.check_x509.validate_x509")
@patch("eden.fs.cli.config.EdenInstance.get_running_version")
Expand All @@ -201,8 +237,10 @@ def test_health_report_check_for_invalid_certs(
mock_get_running_version: MagicMock,
mock_validate_x509: MagicMock,
mock_find_x509_path: MagicMock,
mock_get_chef_log_path: MagicMock,
) -> None:
mock_argument_parser, args = self.setup()
mock_argument_parser, args, file_path = self.setup()
mock_get_chef_log_path.return_value = file_path
mock_find_x509_path.return_value = ("some_cert_path",)
mock_validate_x509.return_value = False
mock_get_running_version.return_value = acceptable_version
Expand Down
14 changes: 14 additions & 0 deletions eden/fs/cli/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ class RepoError(Exception):
PRJFS_MOUNT_PROTOCOL_STRING = "prjfs"

INODE_CATALOG_TYPE_IN_MEMORY_STRING = "inmemory"
CHEF_LOG_PATH_DARWIN = "/var/chef/outputs/chef.last.run_stats"
CHEF_LOG_PATH_LINUX = "/var/chef/outputs/chef.last.run_stats"
CHEF_LOG_PATH_WIN32 = "C:\\chef\\outputs\\chef.last.run_stats"


class EdenStartError(Exception):
Expand Down Expand Up @@ -292,6 +295,17 @@ def check_daemon_health() -> Optional[HealthStatus]:
return poll_until(check_daemon_health, timeout=timeout, timeout_ex=timeout_ex)


def get_chef_log_path(platform: str) -> Optional[str]:
"""Get the path to the Chef log file."""
if platform == "Darwin":
return CHEF_LOG_PATH_DARWIN
elif platform == "Linux":
return CHEF_LOG_PATH_LINUX
elif platform == "Windows":
return CHEF_LOG_PATH_WIN32
return None


def get_home_dir() -> Path:
# NOTE: Path.home() should work on all platforms, but we would want
# to be careful about making that change in case users have muddled with
Expand Down

0 comments on commit a2dae52

Please sign in to comment.