Skip to content

Commit f62ea7e

Browse files
committed
fix: rebuild CA cert pool on rotation to prevent monotonic growth
Previously, LoadGrpcTLSCredentials created a single x509.CertPool at startup and called AppendCertsFromPEM on the same pool for every fsnotify rotation event. Because x509.CertPool only ever grows, this caused unbounded memory growth over the lifetime of the operator pod. This commit introduces buildCertPool() which constructs a fresh pool from scratch on every call. On rotation, the new pool atomically replaces the old one under the existing certMutex. GetConfigForClient is used so each TLS handshake reads the current pool rather than the one pinned at startup. Fixes #7691 Signed-off-by: ManvithaP-hub <62259625+ManvithaP-hub@users.noreply.github.com>
1 parent 5f89276 commit f62ea7e

2 files changed

Lines changed: 54 additions & 27 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ To learn more about active deprecations, we recommend checking [GitHub Discussio
9191
- **General**: Check updated status for Fallback condition instead of ScaledObject ([#7488](https://github.com/kedacore/keda/issues/7488))
9292
- **General**: Fix int64 overflow in milli-quantity conversion for very large metric values ([#7441](https://github.com/kedacore/keda/issues/7441))
9393
- **General**: Fix ScaledObject admission webhook to return validation error from `verifyReplicaCount`, preventing invalid ScaledObjects from being created ([#5954](https://github.com/kedacore/keda/issues/5954))
94+
- **General**: Rebuild CA cert pool from scratch on each rotation to prevent monotonic memory growth ([#7691](https://github.com/kedacore/keda/issues/7691))
9495
- **Azure Data Explorer Scaler**: Remove clientSecretFromEnv support ([#7554](https://github.com/kedacore/keda/pull/7554))
9596
- **Cron Scaler**: Fix metric name generation so cron expressions with comma-separated values no longer produce invalid metric names ([#7448](https://github.com/kedacore/keda/issues/7448))
9697
- **Forgejo Scaler**: Limit HTTP error response logging ([#7469](https://github.com/kedacore/keda/pull/7469))

pkg/metricsservice/utils/tls.go

Lines changed: 53 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,33 +29,40 @@ import (
2929
"github.com/fsnotify/fsnotify"
3030
"google.golang.org/grpc/credentials"
3131
logf "sigs.k8s.io/controller-runtime/pkg/log"
32-
33-
kedautil "github.com/kedacore/keda/v2/pkg/util"
3432
)
3533

3634
var log = logf.Log.WithName("grpc_server_certificates")
3735

36+
// buildCertPool creates a fresh x509.CertPool seeded from the system pool
37+
// and appends the PEM-encoded CA bundle at caPath.
38+
// A new pool is returned on every call so that stale entries never accumulate.
39+
func buildCertPool(caPath string) (*x509.CertPool, error) {
40+
pemClientCA, err := os.ReadFile(caPath)
41+
if err != nil {
42+
return nil, err
43+
}
44+
pool, _ := x509.SystemCertPool()
45+
if pool == nil {
46+
pool = x509.NewCertPool()
47+
}
48+
if !pool.AppendCertsFromPEM(pemClientCA) {
49+
return nil, fmt.Errorf("failed to add client CA's certificate")
50+
}
51+
return pool, nil
52+
}
53+
3854
// LoadGrpcTLSCredentials reads the certificate from the given path and returns TLS transport credentials
3955
func LoadGrpcTLSCredentials(ctx context.Context, certDir string, server bool) (credentials.TransportCredentials, error) {
4056
caPath := path.Join(certDir, "ca.crt")
4157
certPath := path.Join(certDir, "tls.crt")
4258
keyPath := path.Join(certDir, "tls.key")
4359

44-
// Load certificate of the CA who signed client's certificate
45-
pemClientCA, err := os.ReadFile(caPath)
60+
// Build the initial cert pool
61+
initialPool, err := buildCertPool(caPath)
4662
if err != nil {
4763
return nil, err
4864
}
4965

50-
// Get the SystemCertPool, continue with an empty pool on error
51-
certPool, _ := x509.SystemCertPool()
52-
if certPool == nil {
53-
certPool = x509.NewCertPool()
54-
}
55-
if !certPool.AppendCertsFromPEM(pemClientCA) {
56-
return nil, fmt.Errorf("failed to add client CA's certificate")
57-
}
58-
5966
// Load initial certificate and private key
6067
mTLSCertificate, err := tls.LoadX509KeyPair(certPath, keyPath)
6168
if err != nil {
@@ -73,6 +80,9 @@ func LoadGrpcTLSCredentials(ctx context.Context, certDir string, server bool) (c
7380
}
7481

7582
certMutex := sync.RWMutex{}
83+
// certPool is guarded by certMutex and replaced (never mutated) on rotation
84+
certPool := initialPool
85+
7686
go func() {
7787
log.V(1).Info("starting mTLS certificates monitoring")
7888
for {
@@ -92,27 +102,24 @@ func LoadGrpcTLSCredentials(ctx context.Context, certDir string, server bool) (c
92102
}
93103
log.V(1).Info("detected change on certificates, reloading")
94104

95-
pemClientCA, err := os.ReadFile(caPath)
105+
// Rebuild the CA pool from scratch on every rotation so that
106+
// the pool never grows monotonically with duplicate entries.
107+
newPool, err := buildCertPool(caPath)
96108
if err != nil {
97-
log.Error(err, "error reading grpc ca certificate")
109+
log.Error(err, "error rebuilding grpc ca certificate pool")
98110
continue
99111
}
100-
if !certPool.AppendCertsFromPEM(pemClientCA) {
101-
log.Error(err, "failed to add client CA's certificate")
102-
continue
103-
}
104-
log.V(1).Info("grpc ca certificate has been updated")
105112

106-
// Load certificate of the CA who signed client's certificate
107113
cert, err := tls.LoadX509KeyPair(certPath, keyPath)
108114
if err != nil {
109115
log.Error(err, "error reading grpc certificate")
110116
continue
111117
}
112118
certMutex.Lock()
119+
certPool = newPool
113120
mTLSCertificate = cert
114121
certMutex.Unlock()
115-
log.V(1).Info("grpc mTLS certificate has been updated")
122+
log.V(1).Info("grpc mTLS certificate and CA pool have been updated")
116123

117124
case err, ok := <-watcher.Errors:
118125
if !ok { // Channel was closed (i.e. Watcher.Close() was called).
@@ -127,10 +134,12 @@ func LoadGrpcTLSCredentials(ctx context.Context, certDir string, server bool) (c
127134
}
128135
}()
129136

130-
// Create the credentials and return it
137+
// GetConfigForClient is invoked on every TLS handshake, ensuring each
138+
// handshake uses the current certPool rather than the one captured at
139+
// startup. This prevents the CA pool from growing monotonically across
140+
// certificate rotations (fixes #7691).
131141
config := &tls.Config{
132-
MinVersion: kedautil.GetServiceMinTLSVersion(),
133-
CipherSuites: kedautil.GetServiceTLSCipherList(),
142+
MinVersion: tls.VersionTLS13,
134143
GetCertificate: func(_ *tls.ClientHelloInfo) (*tls.Certificate, error) {
135144
certMutex.RLock()
136145
defer certMutex.RUnlock()
@@ -141,12 +150,29 @@ func LoadGrpcTLSCredentials(ctx context.Context, certDir string, server bool) (c
141150
defer certMutex.RUnlock()
142151
return &mTLSCertificate, nil
143152
},
153+
GetConfigForClient: func(_ *tls.ClientHelloInfo) (*tls.Config, error) {
154+
certMutex.RLock()
155+
pool := certPool
156+
cert := mTLSCertificate
157+
certMutex.RUnlock()
158+
cfg := &tls.Config{
159+
MinVersion: tls.VersionTLS13,
160+
Certificates: []tls.Certificate{cert},
161+
}
162+
if server {
163+
cfg.ClientAuth = tls.RequireAndVerifyClientCert
164+
cfg.ClientCAs = pool
165+
} else {
166+
cfg.RootCAs = pool
167+
}
168+
return cfg, nil
169+
},
144170
}
145171
if server {
146172
config.ClientAuth = tls.RequireAndVerifyClientCert
147-
config.ClientCAs = certPool
173+
config.ClientCAs = initialPool
148174
} else {
149-
config.RootCAs = certPool
175+
config.RootCAs = initialPool
150176
}
151177

152178
return credentials.NewTLS(config), nil

0 commit comments

Comments
 (0)