Skip to content

Commit

Permalink
Merge branch 'master' into shahzeb/status-panel-lib-updates
Browse files Browse the repository at this point in the history
  • Loading branch information
mshahzeb committed Dec 13, 2024
2 parents 79ab22b + 0e82e75 commit 70ec4cd
Show file tree
Hide file tree
Showing 20 changed files with 184 additions and 52 deletions.
8 changes: 4 additions & 4 deletions common-lib/common/panels/disk/table/usage.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -237,13 +237,13 @@ base {

+ table.standardOptions.withOverrides([
fieldOverride.byName.new('Mounted on')
+ fieldOverride.byName.withProperty('custom.width', '260'),
+ fieldOverride.byName.withProperty('custom.width', 260),
fieldOverride.byName.new('Size')
+ fieldOverride.byName.withProperty('custom.width', '80'),
+ fieldOverride.byName.withProperty('custom.width', 80),
fieldOverride.byName.new('Used')
+ fieldOverride.byName.withProperty('custom.width', '80'),
+ fieldOverride.byName.withProperty('custom.width', 80),
fieldOverride.byName.new('Available')
+ fieldOverride.byName.withProperty('custom.width', '80'),
+ fieldOverride.byName.withProperty('custom.width', 80),
fieldOverride.byName.new('Used, %')
+ fieldOverride.byName.withProperty(
'custom.cellOptions', {
Expand Down
13 changes: 9 additions & 4 deletions common-lib/common/variables/variables.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ local utils = import '../utils.libsonnet';
prometheusDatasourceName=if enableLokiLogs then 'prometheus_datasource' else 'datasource',
prometheusDatasourceLabel=if enableLokiLogs then 'Prometheus data source' else 'Data source',
): {
// strip leading or trailing commas, if present:
// while a trailing comma is accepted in PromQL expressions, a leading comma is not.
// a leading comma can appear when an empty filteringSelector is concatenated with extra selectors.
local _filteringSelector = std.stripChars(std.stripChars(filteringSelector, ' '), ','),

local varMetricTemplate(varMetric, chainSelector) =
// check if chainSelector is not empty string (case when filtering selector is empty):
if std.type(varMetric) == 'array' && chainSelector != ''
Expand Down Expand Up @@ -57,24 +62,24 @@ local utils = import '../utils.libsonnet';
// Use on dashboards where multiple entities can be selected, like fleet dashboards
multiInstance:
[root.datasources.prometheus]
+ variablesFromLabels(groupLabels, instanceLabels, filteringSelector),
+ variablesFromLabels(groupLabels, instanceLabels, _filteringSelector),
// Use on dashboards where only single entity can be selected
singleInstance:
[root.datasources.prometheus]
+ variablesFromLabels(groupLabels, instanceLabels, filteringSelector, multiInstance=false),
+ variablesFromLabels(groupLabels, instanceLabels, _filteringSelector, multiInstance=false),
queriesSelectorAdvancedSyntax:
std.join(
',',
std.filter(function(x) std.length(x) > 0, [
filteringSelector,
_filteringSelector,
utils.labelsToPromQLSelectorAdvanced(groupLabels + instanceLabels),
])
),
queriesSelector:
std.join(
',',
std.filter(function(x) std.length(x) > 0, [
filteringSelector,
_filteringSelector,
utils.labelsToPromQLSelector(groupLabels + instanceLabels),
])
),
Expand Down
4 changes: 2 additions & 2 deletions csp-mixin/alerts/azure-alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ groups:
annotations:
summary: 'VM CPU utilization is too high.'
description: 'The VM {{ $labels.resourceName }} is under heavy load and may become unresponsive.'
dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e'
dashboard_url: '/a/grafana-csp-app/azure/dashboards/vm'

- alert: AzureVMUnavailable
expr: |
Expand All @@ -27,7 +27,7 @@ groups:
annotations:
summary: 'VM unavailable.'
description: 'The VM {{ $labels.resourceName }} is not functioning or crashed, which may require immediate action.'
dashboard_uid: '58f33c50e66c911b0ad8a25aa438a96e'
dashboard_url: '/a/grafana-csp-app/azure/dashboards/vm'

- alert: AzureDatabaseHighDtuConsumption
expr: |
Expand Down
128 changes: 127 additions & 1 deletion csp-mixin/alerts/gcp-alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,132 @@ groups:
service: 'Compute Engine'
namespace: cloud-provider-gcp
annotations:
summary: 'CPU utilization is too high.'
summary: 'VM CPU utilization is too high.'
description: 'The VM {{ $labels.instance_name }} is under heavy load and may become unresponsive.'
dashboard_uid: 'f115fe73641347c43415535d77e2dc0f'

# GcpCEHighIOLatency: critical alert on Compute Engine disk I/O latency.
# Fires when the average io_latency per (job, project_id, instance_id) stays above 5000 for 5 minutes,
# and keeps firing for 10 minutes after the condition clears.
# NOTE(review): the unit of the 5000 threshold is not stated here - presumably microseconds; confirm against the metric definition.
- alert: GcpCEHighIOLatency
expr: |
avg by (job,project_id,instance_id)(stackdriver_gce_instance_compute_googleapis_com_instance_disk_average_io_latency{job=~".+",project_id=~".+",instance_id=~".+"}) > 5000
for: 5m
keep_firing_for: 10m
labels:
severity: critical
service: 'Compute Engine'
namespace: cloud-provider-gcp
annotations:
summary: 'VM IO latency is too high.'
description: 'Check {{ $labels.instance_id }} VM for I/O bottlenecks and upgrade to SSD if necessary.'
dashboard_uid: 'f115fe73641347c43415535d77e2dc0f'

# GcpCloudSQLHighCpu: critical alert on Cloud SQL CPU utilization.
# Fires when the average cpu_utilization per (job, project_id, instance, database_id), scaled by 100,
# stays above 90 for 5 minutes; keeps firing for 10 minutes after the condition clears.
# NOTE(review): the 100x scaling presumes the metric is reported as a 0-1 ratio - confirm.
- alert: GcpCloudSQLHighCpu
expr: |
100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_cpu_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 90
for: 5m
keep_firing_for: 10m
labels:
severity: critical
service: 'Cloud SQL'
namespace: cloud-provider-gcp
annotations:
summary: 'Database CPU utilization is too high.'
description: 'Check {{ $labels.database_id }} database for high CPU queries and optimize them, or scale up the instance if sustained high usage.'
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'

# GcpCloudSQLMemoryUsage: critical alert on Cloud SQL memory utilization.
# Fires when the average memory_utilization per (job, project_id, instance, database_id), scaled by 100,
# stays above 85 for 5 minutes; keeps firing for 10 minutes after the condition clears.
# NOTE(review): the 100x scaling presumes the metric is reported as a 0-1 ratio - confirm.
- alert: GcpCloudSQLMemoryUsage
expr: |
100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_memory_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 85
for: 5m
keep_firing_for: 10m
labels:
severity: critical
service: 'Cloud SQL'
namespace: cloud-provider-gcp
annotations:
summary: 'Database memory utilization is too high.'
description: 'Review high-memory queries or add more memory to the {{ $labels.database_id }} instance.'
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'

# GcpCloudSQLDiskUsage: critical alert on Cloud SQL disk utilization.
# Fires when the average disk_utilization per (job, project_id, instance, database_id), scaled by 100,
# stays above 85 for 5 minutes; keeps firing for 10 minutes after the condition clears.
# NOTE(review): the 100x scaling presumes the metric is reported as a 0-1 ratio - confirm.
- alert: GcpCloudSQLDiskUsage
expr: |
100 * avg by (job,project_id,instance,database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_disk_utilization{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 85
for: 5m
keep_firing_for: 10m
labels:
severity: critical
service: 'Cloud SQL'
namespace: cloud-provider-gcp
annotations:
summary: 'Database disk utilization is too high.'
description: 'Delete or archive unused data, or increase disk size to the {{ $labels.database_id }} database.'
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'

# GcpCloudSQLActiveConnections: critical alert on Cloud SQL (MySQL) connection saturation.
# Compares the average mysql_threads series with thread_kind="THREADS_CONNECTED" against
# 0.9 x the average mysql_max_connections, both grouped by (job, project_id, instance, database_id).
# Fires after the condition holds for 5 minutes; keeps firing for 10 minutes after it clears.
- alert: GcpCloudSQLActiveConnections
expr: |
avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_threads{thread_kind="THREADS_CONNECTED", job=~".+",project_id=~".+",instance=~".+", database_id=~".+"}) > 0.9 * avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_max_connections{job=~".+",project_id=~".+",instance=~".+", database_id=~".+"})
for: 5m
keep_firing_for: 10m
labels:
severity: critical
service: 'Cloud SQL'
namespace: cloud-provider-gcp
annotations:
summary: 'Too many database active connections.'
description: 'Investigate connection pooling settings and connection management in your {{ $labels.database_id }} application database.'
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'

# GcpCloudSQLAbortedConnections: critical alert on aborted (failed) MySQL connection attempts to Cloud SQL.
# Fires when the summed rate of aborted_connects_count per (job, project_id, instance, database_id)
# stays above 5 for 5 minutes; keeps firing for 10 minutes after the condition clears.
- alert: GcpCloudSQLAbortedConnections
  # database_id must be part of the grouping: the description annotation renders
  # {{ $labels.database_id }}, and labels dropped by the aggregation are empty in templates.
  # NOTE(review): rate() yields a per-second value, so "> 5" means more than 5 aborted connects
  # per second, while the summary says "more than 5 ... in 5 minutes" - confirm which is intended
  # (increase(...[5m]) > 5 would match the summary).
  expr: |
    sum by (job, project_id, instance, database_id) (rate(stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_aborted_connects_count[5m])) > 5
  for: 5m
  keep_firing_for: 10m
  labels:
    severity: critical
    service: 'Cloud SQL'
    namespace: cloud-provider-gcp
  annotations:
    summary: 'More than 5 MySQL failed connections in 5 minutes.'
    description: 'Verify credentials and network settings; check for firewall rules blocking connections for the {{ $labels.database_id }} database.'
    dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'

# GcpCloudSQLLagSecondsBehindMaster: warning alert on Cloud SQL (MySQL) replication lag.
# Fires when the average replication_seconds_behind_master per (job, project_id, instance, database_id)
# stays above 5 for 5 minutes; keeps firing for 10 minutes after the condition clears.
# Unlike the other Cloud SQL rules in this file, the metric is queried without label matchers.
- alert: GcpCloudSQLLagSecondsBehindMaster
expr: |
avg by (job,project_id,instance, database_id) (stackdriver_cloudsql_database_cloudsql_googleapis_com_database_mysql_replication_seconds_behind_master) > 5
for: 5m
keep_firing_for: 10m
labels:
severity: warning
service: 'Cloud SQL'
namespace: cloud-provider-gcp
annotations:
summary: 'More than 5 seconds lag between database read replica and primary.'
description: 'Check {{ $labels.database_id }} database for network latency between primary and replica; adjust configurations to optimize replication.'
dashboard_uid: 'cc710d49022fdd69bed0e992891863e9'

# GcpPubSubNumUndeliveredMessages: warning alert on Pub/Sub subscription backlog size.
# Fires when the average num_undelivered_messages per (job, project_id, instance)
# stays above 1000 for 5 minutes; keeps firing for 10 minutes after the condition clears.
- alert: GcpPubSubNumUndeliveredMessages
expr: |
avg by (job,project_id,instance)(stackdriver_pubsub_subscription_pubsub_googleapis_com_subscription_num_undelivered_messages{job=~".+",project_id=~".+",instance=~".+"}) > 1000
for: 5m
keep_firing_for: 10m
labels:
severity: warning
service: 'Pub/Sub'
namespace: cloud-provider-gcp
annotations:
summary: 'More than 1000 unacknowledged messages for a PubSub subscription.'
description: 'Scale up subscribers or adjust message processing capacity for the {{ $labels.instance }} instance.'
dashboard_uid: '2abad1eb5e4873b95e9176e7ef10a30c'

# GcpPubSubUnackedMessageAge: warning alert on the age of the oldest unacknowledged Pub/Sub message.
# Fires when the average oldest_unacked_message_age per (job, project_id, instance)
# stays above 60 (seconds, per the summary annotation) for 5 minutes;
# keeps firing for 10 minutes after the condition clears.
- alert: GcpPubSubUnackedMessageAge
expr: |
avg by (job,project_id,instance)(stackdriver_pubsub_subscription_pubsub_googleapis_com_subscription_oldest_unacked_message_age{job=~".+",project_id=~".+",instance=~".+"}) > 60
for: 5m
keep_firing_for: 10m
labels:
severity: warning
service: 'Pub/Sub'
namespace: cloud-provider-gcp
annotations:
summary: 'Unacknowledged messages for more than 60 seconds for a PubSub subscription.'
description: 'Investigate {{ $labels.instance }} instance and speed up message processing; ensure consumers can handle the load.'
dashboard_uid: '2abad1eb5e4873b95e9176e7ef10a30c'
17 changes: 6 additions & 11 deletions jvm-mixin/.lint
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
exclusions:
panel-title-description-rule:
reason: "mixtool upgrade made this rule stricter. TODO: Fix errors and remove the warning exclusion"
panel-datasource-rule:
entries:
- panel: GC duration
- panel: Allocated/promoted
panel-units-rule:
reason: "mixtool upgrade made this rule stricter. TODO: Fix errors and remove the warning exclusion"
template-datasource-rule:
reason: "mixtool upgrade made this rule stricter. TODO: Fix errors and remove the warning exclusion"
template-instance-rule:
reason: "mixtool upgrade made this rule stricter. TODO: Fix errors and remove the warning exclusion"
template-job-rule:
reason: "mixtool upgrade made this rule stricter. TODO: Fix errors and remove the warning exclusion"
template-on-time-change-reload-rule:
reason: "mixtool upgrade made this rule stricter. TODO: Fix errors and remove the warning exclusion"
entries:
- panel: Process files open
2 changes: 1 addition & 1 deletion jvm-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
filteringSelector: 'job!=""',
filteringSelector: '',
groupLabels: ['job'],
instanceLabels: ['instance'],
uid: 'jvm',
Expand Down
2 changes: 1 addition & 1 deletion jvm-observ-lib/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
local this = self,
filteringSelector: 'job!=""',
filteringSelector: '', // set to apply static filters to all queries and alerts, e.g. job="bar"
groupLabels: ['job'],
instanceLabels: ['instance'],
uid: 'jvm',
Expand Down
2 changes: 1 addition & 1 deletion kafka-observ-lib/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
local this = self,
filteringSelector: 'job!=""',
filteringSelector: '', // set to apply static filters to all queries and alerts, e.g. job="integrations/kafka"
zookeeperfilteringSelector: this.filteringSelector,
groupLabels: ['kafka_cluster'], // label(s) that defines kafka cluster
instanceLabels: ['instance'], // label(s) that defines single broker
Expand Down
2 changes: 1 addition & 1 deletion kafka-observ-lib/signals/consumerGroup.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ local commonlib = import 'common-lib/common/main.libsonnet';

function(this)
{
filteringSelector: std.join(',', [this.filteringSelector, this.topicsFilteringSelector, this.consumerGroupFilteringSelector]),
filteringSelector: std.join(',', [this.topicsFilteringSelector, this.consumerGroupFilteringSelector, this.filteringSelector]),
groupLabels: this.groupLabels,
instanceLabels: ['topic', 'consumergroup'], // this.instanceLabels is omitted, as it would point to kafka_exporter instance.
aggLevel: 'group',
Expand Down
2 changes: 1 addition & 1 deletion kafka-observ-lib/signals/topic.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ local commonlib = import 'common-lib/common/main.libsonnet';

function(this)
{
filteringSelector: std.join(',', [this.filteringSelector, this.topicsFilteringSelector]),
filteringSelector: std.join(',', [this.topicsFilteringSelector, this.filteringSelector]),
groupLabels: this.groupLabels,
instanceLabels: ['topic'], // this.instanceLabels is omitted, as it would point to kafka_exporter instance.
aggLevel: 'group',
Expand Down
2 changes: 1 addition & 1 deletion kafka-observ-lib/signals/totalTime.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ local commonlib = import 'common-lib/common/main.libsonnet';
// TotalTimeMs metric
function(this)
{
filteringSelector: this.filteringSelector + ', quantile="%s"' % this.totalTimeMsQuantile,
filteringSelector: ('quantile="%s"' % this.totalTimeMsQuantile) + ',' + this.filteringSelector,
groupLabels: this.groupLabels,
instanceLabels: this.instanceLabels,
aggLevel: if this.totalTimeMetricsRepeat then 'instance' else 'group',
Expand Down
2 changes: 1 addition & 1 deletion kafka-observ-lib/signals/zookeeperClient.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ local commonlib = import 'common-lib/common/main.libsonnet';

function(this)
{
filteringSelector: this.filteringSelector + ', quantile="%s"' % this.zookeeperClientQuantile,
filteringSelector: ('quantile="%s"' % this.zookeeperClientQuantile) + ',' + this.filteringSelector,
groupLabels: this.groupLabels,
instanceLabels: this.instanceLabels,
aggLevel: 'instance',
Expand Down
18 changes: 12 additions & 6 deletions logs-lib/logs/variables.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ function(
labels,
)
{
// strip leading or trailing commas, if present, since they are not accepted in LogQL.
// a leading comma can appear when an empty filteringSelector is concatenated with extra selectors.
local _filteringSelector = std.stripChars(std.stripChars(filterSelector, ' '), ','),
local this = self,
local variablesFromLabels(labels, filterSelector) =
local chainVarProto(chainVar) =
Expand All @@ -34,7 +37,7 @@ function(
;
[
chainVarProto(chainVar)
for chainVar in utils.chainLabels(labels, [filterSelector])
for chainVar in utils.chainLabels(labels, [_filteringSelector])
],

datasource:
Expand All @@ -48,12 +51,15 @@ function(

toArray:
[self.datasource]
+ variablesFromLabels(labels, filterSelector)
+ variablesFromLabels(labels, _filteringSelector)
+ [self.regex_search],

queriesSelector:
'%s,%s' % [
filterSelector,
utils.labelsToPromQLSelector(labels),
],
std.join(
',',
std.filter(function(x) std.length(x) > 0, [
_filteringSelector,
utils.labelsToPromQLSelector(labels),
])
),
}
8 changes: 4 additions & 4 deletions openstack-mixin/panels.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -273,8 +273,8 @@ local utils = commonlib.utils;
)
+ gauge.queryOptions.withTargetsMixin(t.vCPUUsed)
+ gauge.standardOptions.withUnit('percent')
+ gauge.standardOptions.withMin('0')
+ gauge.standardOptions.withMax('150')
+ gauge.standardOptions.withMin(0)
+ gauge.standardOptions.withMax(150)
+ gauge.standardOptions.thresholds.withSteps([
gauge.standardOptions.threshold.step.withValue(0) +
gauge.standardOptions.threshold.step.withColor('green'),
Expand All @@ -288,8 +288,8 @@ local utils = commonlib.utils;
)
+ gauge.queryOptions.withTargetsMixin(t.RAMUsed)
+ gauge.standardOptions.withUnit('percent')
+ gauge.standardOptions.withMin('0')
+ gauge.standardOptions.withMax('150')
+ gauge.standardOptions.withMin(0)
+ gauge.standardOptions.withMax(150)
+ gauge.standardOptions.thresholds.withSteps([
gauge.standardOptions.threshold.step.withValue(0) +
gauge.standardOptions.threshold.step.withColor('green'),
Expand Down
2 changes: 1 addition & 1 deletion process-observ-lib/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
// 'instanceLabels' - one or more labels that can be used to identify single entity of instances. In simple cases, can be 'instance' or 'pod'.
// 'uid' - UID to prefix all dashboards original uids
local this = self,
filteringSelector: 'job!=""',
filteringSelector: '', // set to apply static filters to all queries and alerts, e.g. job="bar"
groupLabels: ['job'],
instanceLabels: ['instance'],
dashboardTags: [self.uid],
Expand Down
2 changes: 1 addition & 1 deletion windows-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// labels to identify single windows host:
instanceLabels: ['instance'],
// selector to include in all queries(including alerts)
filteringSelector: 'job=~".*windows.*"',
filteringSelector: '',
// prefix all dashboards uids and alert groups
uid: 'windows',
// prefix dashboards titles
Expand Down
6 changes: 3 additions & 3 deletions windows-observ-lib/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
{
alert: 'WindowsCPUHighUsage',
expr: |||
100 - (avg without (mode, core) (rate(windows_cpu_time_total{%(filteringSelector)s, mode="idle"}[2m])) * 100) > %(alertsCPUThresholdWarning)s
100 - (avg without (mode, core) (rate(windows_cpu_time_total{mode="idle", %(filteringSelector)s}[2m])) * 100) > %(alertsCPUThresholdWarning)s
||| % this.config,
'for': '15m',
keep_firing_for: '5m',
Expand Down Expand Up @@ -133,7 +133,7 @@
{
alert: 'WindowsServiceNotHealthy',
expr: |||
windows_service_status{%(filteringSelector)s, status!~"starting|stopping|ok"} > 0
windows_service_status{status!~"starting|stopping|ok", %(filteringSelector)s} > 0
||| % this.config,
'for': '5m',
labels: {
Expand All @@ -150,7 +150,7 @@
{
alert: 'WindowsDiskDriveNotHealthy',
expr: |||
windows_disk_drive_status{%(filteringSelector)s, status="OK"} != 1
windows_disk_drive_status{status="OK", %(filteringSelector)s} != 1
||| % this.config,
'for': '5m',
labels: {
Expand Down
Loading

0 comments on commit 70ec4cd

Please sign in to comment.