From e2cb6b8cc0912ff818a7c9227dcd1f1f1aadf7e1 Mon Sep 17 00:00:00 2001 From: Michael Nairn Date: Wed, 4 Dec 2024 20:21:19 +0000 Subject: [PATCH] add alerts (single check for pod restarts) fix observability (don't remove all ServiceMonitor resources) Signed-off-by: Michael Nairn --- config/observability/kustomization.yaml | 27 ++++++++++-- .../metrics-server/kustomization.yaml | 24 ++++++++++ test/scale/README.md | 44 +++++++++++++++++-- test/scale/alerts.yaml | 3 ++ test/scale/config.yaml | 2 + test/scale/metrics.yaml | 3 ++ 6 files changed, 97 insertions(+), 6 deletions(-) create mode 100644 config/observability/metrics-server/kustomization.yaml create mode 100644 test/scale/alerts.yaml diff --git a/config/observability/kustomization.yaml b/config/observability/kustomization.yaml index f9b64ec..78811cc 100644 --- a/config/observability/kustomization.yaml +++ b/config/observability/kustomization.yaml @@ -1,16 +1,37 @@ resources: + - ./metrics-server - github.com/kuadrant/kuadrant-operator/config/observability?ref=main - ./thanos - github.com/kuadrant/kuadrant-operator/examples/dashboards?ref=main - github.com/kuadrant/kuadrant-operator/examples/alerts?ref=main patches: - - target: + - patch: | + $patch: delete + apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + name: authorino-operator-metrics + namespace: kuadrant-system + - patch: | + $patch: delete + apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor - patch: | + metadata: + name: dns-operator-metrics-monitor + namespace: kuadrant-system + - patch: | + $patch: delete + apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + name: kuadrant-operator-metrics + namespace: kuadrant-system + - patch: | $patch: delete apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: - name: ANY + name: limitador-operator-metrics + namespace: kuadrant-system - path: k8s_prometheus_patch.yaml diff --git a/config/observability/metrics-server/kustomization.yaml b/config/observability/metrics-server/kustomization.yaml new file mode 100644 index 0000000..67d5364 --- /dev/null +++ b/config/observability/metrics-server/kustomization.yaml @@ -0,0 +1,24 @@ +resources: + - https://github.com/kubernetes-sigs/metrics-server/releases/download/v0.7.1/components.yaml +patches: + - patch: |- + - op: add + path: /spec/template/spec/containers/0/args/- + value: --kubelet-insecure-tls + target: + version: v1 + kind: Deployment + name: metrics-server + namespace: kube-system + - patch: | + $patch: delete + apiVersion: apiregistration.k8s.io/v1 + kind: APIService + metadata: + name: v1beta1.metrics.k8s.io + - patch: | + $patch: delete + apiVersion: rbac.authorization.k8s.io/v1 + kind: ClusterRole + metadata: + name: system:aggregated-metrics-reader diff --git a/test/scale/README.md b/test/scale/README.md index cb79a2a..87903af 100644 --- a/test/scale/README.md +++ b/test/scale/README.md @@ -1,4 +1,6 @@ +## Setup local environment (kind) + Create a kind cluster with prometheus/thanos installed and configured ```shell make local-setup @@ -11,7 +13,7 @@ Forward port for prometheus kubectl -n monitoring port-forward service/thanos-query 9090:9090 ``` -Forward port for graphana (Optional) +Forward port for grafana (Optional) ```shell kubectl -n monitoring port-forward service/grafana 3000:3000 ``` @@ -22,7 +24,43 @@ Tail all operator logs (Optional) kubectl stern -l control-plane=dns-operator-controller-manager -A ``` -Run default scale test(1 iteration using the inmemory provider) +## Run scale test + +Export Environment variables: +```shell +#All +export PROMETHEUS_URL=http://127.0.0.1:9090 +export PROMETHEUS_TOKEN="" +#AWS +export KUADRANT_AWS_ACCESS_KEY_ID= +export KUADRANT_AWS_SECRET_ACCESS_KEY= +export KUADRANT_AWS_REGION="" +#GCP +export KUADRANT_GCP_GOOGLE_CREDENTIALS= +export KUADRANT_GCP_PROJECT_ID= +#Azure +export KUADRANT_AZURE_CREDENTIALS= +``` + +### inmemory + +```shell +make test-scale +``` +### aws + +```shell +make test-scale DNS_PROVIDER=aws KUADRANT_ZONE_ROOT_DOMAIN= +``` + +### gcp + ```shell -PROMETHEUS_URL=http://127.0.0.1:9090 PROMETHEUS_TOKEN="" make test-scale +make test-scale DNS_PROVIDER=gcp KUADRANT_ZONE_ROOT_DOMAIN= ``` + +### azure + +```shell +make test-scale DNS_PROVIDER=azure KUADRANT_ZONE_ROOT_DOMAIN= +``` \ No newline at end of file diff --git a/test/scale/alerts.yaml b/test/scale/alerts.yaml new file mode 100644 index 0000000..fdf916b --- /dev/null +++ b/test/scale/alerts.yaml @@ -0,0 +1,3 @@ +- expr: increase(kube_pod_container_status_restarts_total{container="manager", namespace=~"kuadrant-system|kuadrant-dns-operator-.*"}[5m]) > 0 + description: manager pod restarts + severity: error diff --git a/test/scale/config.yaml b/test/scale/config.yaml index 6d15eb1..08447fd 100644 --- a/test/scale/config.yaml +++ b/test/scale/config.yaml @@ -3,6 +3,8 @@ metricsEndpoints: token: {{ .PROMETHEUS_TOKEN }} metrics: - ./metrics.yaml + alerts: + - ./alerts.yaml indexer: type: local metricsDirectory: ./metrics diff --git a/test/scale/metrics.yaml b/test/scale/metrics.yaml index 8a491f9..9673883 100644 --- a/test/scale/metrics.yaml +++ b/test/scale/metrics.yaml @@ -1,2 +1,5 @@ - query: sum(rate(container_cpu_usage_seconds_total{container="",namespace=~"kuadrant-system|kuadrant-dns-operator-*|scale-test-.*"}[5m])) by(namespace) metricName: namespaceCPU + +- query: sum(rate(kube_pod_container_status_restarts_total{container="manager", namespace=~"kuadrant-system|kuadrant-dns-operator-.*"}[5m])) by(namespace) + metricName: managerPodRestarts