Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/workflows/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,13 @@ jobs:
mongo_jobs=zenko/data-db-mongodb-sharded-shard.*
lifecycle_jobs=artesca-data-backbeat-lifecycle-.*-headless
github_token: ${{ steps.app-token.outputs.token }}

# Render the zenko-operator alert rules with the inputs below and run the
# promtool-style unit tests from alerts.test.yaml against them.
- name: Render and test zenko-operator alerts
  uses: scality/action-prom-render-test@1.0.3
  with:
    alert_file_path: 'monitoring/zenko-operator/alerts.yaml'
    test_file_path: 'monitoring/zenko-operator/alerts.test.yaml'
    # One key=value pair per line; these match the `x-inputs` names declared
    # in the alert file.
    alert_inputs: |
      namespace=zenko
      job=zenko-operator-metrics
    github_token: ${{ steps.app-token.outputs.token }}
1 change: 1 addition & 0 deletions monitoring/zenko-operator/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
alerts.rendered.yaml
223 changes: 223 additions & 0 deletions monitoring/zenko-operator/alerts.test.yaml
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Usually we keep the alerts alongside the component that "owns" these metrics, to reduce coupling and to ensure that the metrics and the alerts/dashboards are renamed at the same time.

That would mean both alerts.yaml and the ORAS image build would live in zenko-operator; in Zenko we would then simply need to change the content of zenkoversion to reference zkop's dashboard image (along with the manifest change to enable scraping).

Is there any reason not to do this for these alerts?

Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
# Unit tests for the zenko-operator alerting rules (alerts.yaml).
#
# The tests run against the rendered rule file, i.e. with namespace="zenko"
# and job="zenko-operator-metrics" substituted into the rule expressions.
#
# All `values` strings are quoted on purpose: a bare `0x10` matches the YAML
# 1.1 hexadecimal-integer pattern and may be rewritten as 16 by linters,
# formatters or round-trip tools, which would silently change the series.
evaluation_interval: 1m
rule_files:
  - alerts.rendered.yaml

tests:
  # ZenkoOperatorReconcileErrors - no errors
  ##################################################################################################
  - name: No reconcile errors
    interval: 1m
    input_series:
      # Counter stays flat at 0, so rate() never becomes positive.
      - series: zenko_operator_reconcile_errors_total{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data"}
        values: '0x10'
    alert_rule_test:
      - alertname: ZenkoOperatorReconcileErrors
        eval_time: 10m
        exp_alerts: []

  # ZenkoOperatorReconcileErrors - errors trigger alert after for: 2m
  ##################################################################################################
  - name: Reconcile errors trigger alert
    interval: 1m
    input_series:
      # Counter increases by 1 every minute, starting from 0.
      - series: zenko_operator_reconcile_errors_total{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data"}
        values: '0+1x20'
    alert_rule_test:
      # The rule has `for: 2m`: it must not be firing yet at 2m ...
      - alertname: ZenkoOperatorReconcileErrors
        eval_time: 2m
        exp_alerts: []
      # ... and must be firing at 4m.
      - alertname: ZenkoOperatorReconcileErrors
        eval_time: 4m
        exp_alerts:
          - exp_labels:
              severity: warning
              job: zenko-operator-metrics
              controller: zenko
              namespace: zenko
              name: artesca-data
            exp_annotations:
              summary: 'Zenko operator reconciliation errors detected'
              description: 'The Zenko operator controller zenko is experiencing reconciliation errors for instance zenko/artesca-data.'

  # ZenkoOperatorDeploymentFailed - condition never true
  ##################################################################################################
  - name: No deployment failure
    interval: 1m
    input_series:
      - series: zenko_operator_instance_condition{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",condition="DeploymentFailure"}
        values: '0x20'
    alert_rule_test:
      - alertname: ZenkoOperatorDeploymentFailed
        eval_time: 20m
        exp_alerts: []

  # ZenkoOperatorDeploymentFailed - condition fires after for: 5m
  ##################################################################################################
  - name: Deployment failure triggers alert
    interval: 1m
    input_series:
      # '0x3' yields 0 at 0m-3m; the condition is set (1) from 4m onwards.
      - series: zenko_operator_instance_condition{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",condition="DeploymentFailure"}
        values: '0x3 1x20'
    alert_rule_test:
      # Still within the `for: 5m` window at 7m.
      - alertname: ZenkoOperatorDeploymentFailed
        eval_time: 7m
        exp_alerts: []
      # Condition has held for 5m at 9m.
      - alertname: ZenkoOperatorDeploymentFailed
        eval_time: 9m
        exp_alerts:
          - exp_labels:
              severity: warning
              job: zenko-operator-metrics
              controller: zenko
              namespace: zenko
              name: artesca-data
              condition: DeploymentFailure
            exp_annotations:
              summary: 'Zenko operator deployment failure'
              description: 'Zenko instance zenko/artesca-data has a DeploymentFailure condition. The operator is unable to reconcile the instance to the desired state.'

  # ZenkoOperatorSecurityOptionEnabled - skipTLSVerify never enabled
  ##################################################################################################
  - name: No security option enabled
    interval: 1m
    input_series:
      - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="skipTLSVerify"}
        values: '0x30'
    alert_rule_test:
      - alertname: ZenkoOperatorSecurityOptionEnabled
        eval_time: 30m
        exp_alerts: []

  # ZenkoOperatorSecurityOptionEnabled - skipTLSVerify fires after for: 15m
  ##################################################################################################
  - name: skipTLSVerify triggers security alert
    interval: 1m
    input_series:
      # Option is disabled at 0m-3m and enabled (1) from 4m onwards.
      - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="skipTLSVerify"}
        values: '0x3 1x30'
    alert_rule_test:
      # Still within the `for: 15m` window at 17m.
      - alertname: ZenkoOperatorSecurityOptionEnabled
        eval_time: 17m
        exp_alerts: []
      # Option has been enabled for 15m at 19m.
      - alertname: ZenkoOperatorSecurityOptionEnabled
        eval_time: 19m
        exp_alerts:
          - exp_labels:
              severity: warning
              job: zenko-operator-metrics
              controller: zenko
              namespace: zenko
              name: artesca-data
              option: skipTLSVerify
            exp_annotations:
              summary: 'Security-bypassing option enabled on Zenko instance'
              description: 'The option skipTLSVerify is enabled on Zenko instance zenko/artesca-data. This option bypasses security controls and must not remain active in production environments.'

  # ZenkoOperatorSecurityOptionEnabled - noImplicitDeny fires after for: 15m
  ##################################################################################################
  - name: noImplicitDeny triggers security alert
    interval: 1m
    input_series:
      - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="noImplicitDeny"}
        values: '0x3 1x30'
    alert_rule_test:
      - alertname: ZenkoOperatorSecurityOptionEnabled
        eval_time: 19m
        exp_alerts:
          - exp_labels:
              severity: warning
              job: zenko-operator-metrics
              controller: zenko
              namespace: zenko
              name: artesca-data
              option: noImplicitDeny
            exp_annotations:
              summary: 'Security-bypassing option enabled on Zenko instance'
              description: 'The option noImplicitDeny is enabled on Zenko instance zenko/artesca-data. This option bypasses security controls and must not remain active in production environments.'

  # ZenkoOperatorTestConfigActive - cron override fires after for: 1h
  ##################################################################################################
  - name: Cron override does not fire before 1h
    interval: 1m
    input_series:
      # Option is enabled from the very first sample (0m).
      - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="lifecycleConductorCronOverride"}
        values: '1x120'
    alert_rule_test:
      # One minute short of the `for: 1h` window.
      - alertname: ZenkoOperatorTestConfigActive
        eval_time: 59m
        exp_alerts: []

  - name: Cron override triggers test config alert after 1h
    interval: 1m
    input_series:
      - series: zenko_operator_debug_option{namespace="zenko",job="zenko-operator-metrics",controller="zenko",name="artesca-data",option="lifecycleConductorCronOverride"}
        values: '1x120'
    alert_rule_test:
      - alertname: ZenkoOperatorTestConfigActive
        eval_time: 61m
        exp_alerts:
          - exp_labels:
              severity: warning
              job: zenko-operator-metrics
              controller: zenko
              namespace: zenko
              name: artesca-data
              option: lifecycleConductorCronOverride
            exp_annotations:
              summary: 'Test/debug configuration active on Zenko instance'
              description: 'The option lifecycleConductorCronOverride has been active on Zenko instance zenko/artesca-data for more than 1 hour. This option is intended for testing only and should not remain enabled in production.'

  # ZenkoOperatorDRPhaseFailed - Bootstrap:Failed fires after for: 1m
  ##################################################################################################
  - name: DR phase not failed
    interval: 1m
    input_series:
      - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Bootstrap:Failed"}
        values: '0x10'
    alert_rule_test:
      - alertname: ZenkoOperatorDRPhaseFailed
        eval_time: 10m
        exp_alerts: []

  - name: Bootstrap Failed triggers DR phase alert
    interval: 1m
    input_series:
      # '0x2' yields 0 at 0m-2m; the failed phase is reported (1) from 3m onwards.
      - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Bootstrap:Failed"}
        values: '0x2 1x20'
    alert_rule_test:
      # Phase is not failed yet at 2m.
      - alertname: ZenkoOperatorDRPhaseFailed
        eval_time: 2m
        exp_alerts: []
      # Failed phase has held for the `for: 1m` window at 4m.
      - alertname: ZenkoOperatorDRPhaseFailed
        eval_time: 4m
        exp_alerts:
          - exp_labels:
              severity: critical
              job: zenko-operator-metrics
              controller: drsink
              namespace: zenko
              name: artesca-data-dr
              phase: 'Bootstrap:Failed'
            exp_annotations:
              summary: 'DR controller entered a failed phase'
              description: 'DR controller drsink for instance zenko/artesca-data-dr is in phase Bootstrap:Failed. Manual intervention is required to recover.'

  - name: Failback Failed triggers DR phase alert
    interval: 1m
    input_series:
      - series: zenko_operator_instance_phase{namespace="zenko",job="zenko-operator-metrics",controller="drsink",name="artesca-data-dr",phase="Failback:Failed"}
        values: '0x2 1x20'
    alert_rule_test:
      - alertname: ZenkoOperatorDRPhaseFailed
        eval_time: 4m
        exp_alerts:
          - exp_labels:
              severity: critical
              job: zenko-operator-metrics
              controller: drsink
              namespace: zenko
              name: artesca-data-dr
              phase: 'Failback:Failed'
            exp_annotations:
              summary: 'DR controller entered a failed phase'
              description: 'DR controller drsink for instance zenko/artesca-data-dr is in phase Failback:Failed. Manual intervention is required to recover.'
73 changes: 73 additions & 0 deletions monitoring/zenko-operator/alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Alerting rules for the Zenko operator.
#
# `x-inputs` declares the ${namespace} and ${job} placeholders used in the
# expressions below, together with their default values.
# NOTE(review): presumably these are substituted when the file is rendered
# into alerts.rendered.yaml - confirm against the rendering action.
x-inputs:
  - name: namespace
    type: constant
    value: zenko
  - name: job
    type: constant
    value: zenko-operator-metrics

groups:
  - name: ZenkoOperator
    rules:

      # Reconcile error counter has been increasing over the last 5 minutes.
      - alert: ZenkoOperatorReconcileErrors
        expr: |
          rate(zenko_operator_reconcile_errors_total{namespace="${namespace}", job="${job}"}[5m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Zenko operator reconciliation errors detected"
          description: >-
            The Zenko operator controller {{ $labels.controller }} is
            experiencing reconciliation errors for instance
            {{ $labels.namespace }}/{{ $labels.name }}.

      # Instance reports the DeploymentFailure condition.
      - alert: ZenkoOperatorDeploymentFailed
        expr: |
          zenko_operator_instance_condition{namespace="${namespace}", job="${job}", condition="DeploymentFailure"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Zenko operator deployment failure"
          description: >-
            Zenko instance {{ $labels.namespace }}/{{ $labels.name }} has a
            DeploymentFailure condition. The operator is unable to reconcile
            the instance to the desired state.

      # A security-bypassing option (skipTLSVerify or noImplicitDeny) is set.
      - alert: ZenkoOperatorSecurityOptionEnabled
        expr: |
          zenko_operator_debug_option{namespace="${namespace}", job="${job}", option=~"skipTLSVerify|noImplicitDeny"} == 1
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Security-bypassing option enabled on Zenko instance"
          description: >-
            The option {{ $labels.option }} is enabled on Zenko instance
            {{ $labels.namespace }}/{{ $labels.name }}. This option bypasses
            security controls and must not remain active in production
            environments.

      # A test/debug option matched by the regex below has stayed enabled.
      - alert: ZenkoOperatorTestConfigActive
        expr: |
          zenko_operator_debug_option{namespace="${namespace}", job="${job}", option=~".*CronOverride|.*Verbose|triggerTransitions.*|triggerExpirations.*"} == 1
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Test/debug configuration active on Zenko instance"
          description: >-
            The option {{ $labels.option }} has been active on Zenko instance
            {{ $labels.namespace }}/{{ $labels.name }} for more than 1 hour.
            This option is intended for testing only and should not remain
            enabled in production.

      # Instance phase is Bootstrap:Failed or Failback:Failed.
      - alert: ZenkoOperatorDRPhaseFailed
        expr: |
          zenko_operator_instance_phase{namespace="${namespace}", job="${job}", phase=~"Bootstrap:Failed|Failback:Failed"} == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "DR controller entered a failed phase"
          description: >-
            DR controller {{ $labels.controller }} for instance
            {{ $labels.namespace }}/{{ $labels.name }} is in phase
            {{ $labels.phase }}. Manual intervention is required to recover.
5 changes: 5 additions & 0 deletions solution/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ spec:
image:
name: zenko-operator
tag: '$(zenko_operator_tag)'
metrics:
enabled: true
scheme: http
port: 8080
path: /metrics
EOF
}

Expand Down
Loading