diff --git a/api/src/controllers/monitoring.js b/api/src/controllers/monitoring.js index daf5ae3d378..cb0c8faaebb 100644 --- a/api/src/controllers/monitoring.js +++ b/api/src/controllers/monitoring.js @@ -121,6 +121,16 @@ const DEFAULT_CONNECTED_USER_INTERVAL = 7; * count: * type: number * description: Number of users that exceeded the replication limit of documents. + * MonitoringReplicationFailure: + * type: object + * properties: + * count: + * type: number + * description: > + * Number of distinct users that had at least one replication failure in the + * reporting window. The window defaults to 7 days but can be changed via the + * `connected_user_interval` query parameter (the same parameter that drives + * `connected_users.count`). * MonitoringConnectedUsers: * type: object * properties: @@ -243,7 +253,9 @@ module.exports = { * schema: * type: number * default: 7 - * description: The number of days to use when counting connected users + * description: > + * The number of days to use when counting connected users and users with replication + * failures. * responses: * '200': * description: Monitoring metrics @@ -372,6 +384,8 @@ module.exports = { * $ref: '#/components/schemas/MonitoringConflict' * replication_limit: * $ref: '#/components/schemas/MonitoringReplicationLimit' + * replication_failure: + * $ref: '#/components/schemas/MonitoringReplicationFailure' * connected_users: * $ref: '#/components/schemas/MonitoringConnectedUsers' */ diff --git a/api/src/services/monitoring.js b/api/src/services/monitoring.js index 6453d54e12f..78dccc50917 100644 --- a/api/src/services/monitoring.js +++ b/api/src/services/monitoring.js @@ -5,6 +5,7 @@ const db = require('../db'); const environment = require('@medic/environment'); const logger = require('@medic/logger'); const deployInfoService = require('./deploy-info'); +const replicationFailureLog = require('./replication/replication-failure-log'); const { SENTINEL_METADATA } = require('@medic/constants'); const DBS_TO_MONITOR = { @@ -347,6 +348,15 @@ const getReplicationLimitLog = () => { }); }; +const getReplicationFailuresUserCount = (intervalDays) => { + return replicationFailureLog + .getUsersWithFailuresCount(intervalDays) + .catch(err => { + logger.error('Error fetching replication failures user count: %o', err); + return -1; + }); +}; + const getConnectedUserLogs = (connectedUserInterval) => { const earliestTimestamp = moment().subtract(connectedUserInterval, 'days').valueOf(); return db.medicLogs @@ -428,13 +438,15 @@ const jsonV2 = (connectedUserInterval) => { jsonV1(connectedUserInterval), getWeeklyOutgoingMessageStatusCounts(), getLastHundredStatusUpdatesCounts(), + getReplicationFailuresUserCount(connectedUserInterval), ]) - .then(([jsonV1, weeklyOutgoingMessageStatus, lastHundredCounts]) => { + .then(([jsonV1, weeklyOutgoingMessageStatus, lastHundredCounts, replicationFailuresUserCount]) => { jsonV1.messaging.outgoing = { total: jsonV1.messaging.outgoing.state, seven_days: weeklyOutgoingMessageStatus, last_hundred: lastHundredCounts, }; + jsonV1.replication_failure = { count: replicationFailuresUserCount }; return jsonV1; }); diff --git a/api/src/services/replication/replication-failure-log.js b/api/src/services/replication/replication-failure-log.js index 9f9b0d2a411..cd95c2917b2 100644 --- a/api/src/services/replication/replication-failure-log.js +++ b/api/src/services/replication/replication-failure-log.js @@ -5,17 +5,19 @@ const pagination = require('../pagination'); const TYPE_PREFIX = `replication-fail-`; const MAX_FAILURES = 50; const REPORTING_PERIOD_FORMAT = 'YYYY-MM'; +const DAILY_COUNT_KEY_FORMAT = 'YYYY-MM-DD'; const UNKNOWN = 'unknown'; const MAX_PERIODS = 60; const captureFailure = async (userCtx, requestId, statusCode, duration) => { const log = await getLog(userCtx.name); + const now = moment(); // Counts are only set on userCtx as each phase of the request completes. When a count is missing // we record 'unknown' instead of omitting the key, so a stable shape on the failure entry tells you // (by which counters are 'unknown') how far the request progressed before failing. const failure = { - date: moment().valueOf(), + date: now.valueOf(), status_code: statusCode, duration: duration, request_id: requestId, @@ -31,6 +33,10 @@ const captureFailure = async (userCtx, requestId, statusCode, duration) => { log.failures = log.failures.slice(-MAX_FAILURES); } + const dayKey = now.format(DAILY_COUNT_KEY_FORMAT); + log.daily_counts = log.daily_counts || {}; + log.daily_counts[dayKey] = (log.daily_counts[dayKey] || 0) + 1; + return db.medicLogs.put(log); }; @@ -54,6 +60,7 @@ const getLog = async (userName) => { date: moment().valueOf(), total_failures: 0, failures: [], + daily_counts: {}, }; } throw err; @@ -151,7 +158,14 @@ const get = async ({ user, reportingPeriod, cursor = 0, limit = pagination.DEFAU return getPageByRange({ reportingPeriod, skip: cursor, limit }); }; +const getUsersWithFailuresCount = async (intervalDays) => { + const sinceKey = moment().subtract(intervalDays, 'days').format(DAILY_COUNT_KEY_FORMAT); + const result = await db.medicLogs.query('logs/replication_failures', { startkey: [sinceKey] }); + return new Set(result.rows.map(row => row.key[1])).size; +}; + module.exports = { capture: captureFailure, get, + getUsersWithFailuresCount, }; diff --git a/api/tests/mocha/services/monitoring.spec.js b/api/tests/mocha/services/monitoring.spec.js index 8422691743d..a42f5ce051a 100644 --- a/api/tests/mocha/services/monitoring.spec.js +++ b/api/tests/mocha/services/monitoring.spec.js @@ -6,6 +6,7 @@ const _ = require('lodash'); const db = require('../../../src/db'); const environment = require('@medic/environment'); const deployInfo = require('../../../src/services/deploy-info'); +const replicationFailureLog = require('../../../src/services/replication/replication-failure-log'); const service = require('../../../src/services/monitoring'); const { getBundledDdocs } = require('../../../src/services/setup/utils'); const { DATABASES } = require('../../../src/services/setup/databases'); @@ -245,6 +246,7 @@ const setUpMocks = () => { .resolves({ rows: [ { value: 1 } ] }) .withArgs('logs/connected_users', { startkey: 0, reduce: true }) .resolves({ rows: [ { value: 2 } ] }); + sinon.stub(replicationFailureLog, 'getUsersWithFailuresCount').resolves(5); }; const generateRows = (statusCounters) => { @@ -568,6 +570,7 @@ describe('Monitoring service', () => { chai.expect(actual.conflict).to.deep.equal({ count: 40 }); chai.expect(actual.date.current).to.equal(0); chai.expect(actual.replication_limit.count).to.equal(1); + chai.expect(actual.replication_failure).to.deep.equal({ count: 5 }); chai.expect(actual.connected_users.count).to.equal(2); chai.expect(request.get.args).to.deep.equalInAnyOrder([ [{ json: true, url: environment.serverUrl }], @@ -603,6 +606,16 @@ describe('Monitoring service', () => { }); }); + it('v1 does not include replication_failure', () => { + setUpMocks(); + + return service.jsonV1().then(actual => { + chai.expect(actual).not.to.have.property('replication_failure'); + // v1 must not even invoke the replication-failure service. + chai.expect(replicationFailureLog.getUsersWithFailuresCount.called).to.equal(false); + }); + }); + it('v1 handles errors gracefully', () => { sinon.stub(deployInfo, 'get').rejects(); sinon.stub(request, 'get').rejects(); @@ -727,6 +740,7 @@ describe('Monitoring service', () => { sinon.stub(db.sentinel, 'query').rejects(); sinon.stub(db.medicUsersMeta, 'query').rejects(); sinon.stub(db.medicLogs, 'query').rejects(); + sinon.stub(replicationFailureLog, 'getUsersWithFailuresCount').rejects(); return service.jsonV2().then(actual => { chai.expect(actual.version).to.deep.equal({ @@ -829,6 +843,7 @@ describe('Monitoring service', () => { chai.expect(actual.outbound_push).to.deep.equal({ backlog: -1 }); chai.expect(actual.feedback).to.deep.equal({ count: -1 }); chai.expect(actual.replication_limit).to.deep.equal({ count: -1 }); + chai.expect(actual.replication_failure).to.deep.equal({ count: -1 }); chai.expect(actual.connected_users).to.deep.equal({ count: -1 }); chai.expect(request.get.args).to.deep.equalInAnyOrder([ [{ json: true, url: environment.serverUrl }], diff --git a/api/tests/mocha/services/replication/replication-failure-log.spec.js b/api/tests/mocha/services/replication/replication-failure-log.spec.js index 7d612c0e32c..d03973570c8 100644 --- a/api/tests/mocha/services/replication/replication-failure-log.spec.js +++ b/api/tests/mocha/services/replication/replication-failure-log.spec.js @@ -9,6 +9,7 @@ describe('Replication Failure Log Service', () => { sinon.stub(db.medicLogs, 'allDocs'); sinon.stub(db.medicLogs, 'get'); sinon.stub(db.medicLogs, 'put'); + sinon.stub(db.medicLogs, 'query'); }); afterEach(() => { @@ -265,6 +266,72 @@ describe('Replication Failure Log Service', () => { }); }); + describe('getUsersWithFailuresCount', () => { + const row = (day, user, count) => ({ key: [day, user], value: count }); + + it('should query the view with a windowed startkey and return distinct users', async () => { + // Today: 2026-04-15. Interval: 7 days. Since: 2026-04-08. + sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf()); + db.medicLogs.query.resolves({ + rows: [ + row('2026-04-10', 'alice', 1), + row('2026-04-12', 'alice', 3), + row('2026-04-14', 'bob', 2), + row('2026-04-09', 'clare', 1), + ], + }); + + const result = await replicationFailureLog.getUsersWithFailuresCount(7); + + expect(db.medicLogs.query.callCount).to.equal(1); + expect(db.medicLogs.query.args[0]).to.deep.equal([ + 'logs/replication_failures', + { startkey: ['2026-04-08'] }, + ]); + // alice appears twice but is counted once. + expect(result).to.equal(3); + }); + + it('should honour the interval argument', async () => { + sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf()); + db.medicLogs.query.resolves({ rows: [] }); + + await replicationFailureLog.getUsersWithFailuresCount(30); + + expect(db.medicLogs.query.args[0][1]).to.deep.equal({ startkey: ['2026-03-16'] }); + }); + + it('should return 0 when the view returns no rows', async () => { + sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf()); + db.medicLogs.query.resolves({ rows: [] }); + + const result = await replicationFailureLog.getUsersWithFailuresCount(7); + + expect(result).to.equal(0); + }); + + it('should span across a year boundary', async () => { + sinon.useFakeTimers(new Date('2026-01-05T12:00:00Z').valueOf()); + db.medicLogs.query.resolves({ rows: [] }); + + await replicationFailureLog.getUsersWithFailuresCount(30); + + expect(db.medicLogs.query.args[0][1]).to.deep.equal({ startkey: ['2025-12-06'] }); + }); + + it('should propagate db errors', async () => { + sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf()); + db.medicLogs.query.rejects({ status: 500, message: 'db error' }); + + try { + await replicationFailureLog.getUsersWithFailuresCount(7); + expect.fail('should have thrown'); + } catch (err) { + expect(err).to.deep.equal({ status: 500, message: 'db error' }); + } + }); + }); + describe('capture', () => { it('should create a new log when none exists', async () => { const now = new Date('2026-04-15T12:00:00Z').valueOf(); @@ -298,6 +365,7 @@ describe('Replication Failure Log Service', () => { unpurged_docs_count: 1200, roles: ['chw'], }], + daily_counts: { '2026-04-15': 1 }, }); }); @@ -470,6 +538,64 @@ describe('Replication Failure Log Service', () => { unpurged_docs_count: 'unknown', roles: ['chw'], }], + daily_counts: { '2026-04-15': 1 }, + }); + }); + + describe('daily_counts', () => { + it('should increment the bucket for the current day when a doc exists with no buckets', async () => { + sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf()); + // Failure-log docs predating the daily_counts rollout lack the field — we add it lazily. + db.medicLogs.get.resolves({ + _id: 'replication-fail-2026-04-bob', + _rev: '1-abc', + user: 'bob', + total_failures: 5, + failures: [], + }); + db.medicLogs.put.resolves(); + + await replicationFailureLog.capture({ name: 'bob', roles: ['chw'] }, 'req', 500, 100); + + expect(db.medicLogs.put.args[0][0].daily_counts).to.deep.equal({ '2026-04-15': 1 }); + }); + + it('should increment an existing same-day bucket', async () => { + sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf()); + db.medicLogs.get.resolves({ + _id: 'replication-fail-2026-04-bob', + _rev: '1-abc', + user: 'bob', + total_failures: 7, + failures: [], + daily_counts: { '2026-04-15': 7 }, + }); + db.medicLogs.put.resolves(); + + await replicationFailureLog.capture({ name: 'bob', roles: ['chw'] }, 'req', 500, 100); + + expect(db.medicLogs.put.args[0][0].daily_counts).to.deep.equal({ '2026-04-15': 8 }); + }); + + it('should add a new bucket on a new day while preserving prior days', async () => { + sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf()); + db.medicLogs.get.resolves({ + _id: 'replication-fail-2026-04-bob', + _rev: '1-abc', + user: 'bob', + total_failures: 4, + failures: [], + daily_counts: { '2026-04-13': 2, '2026-04-14': 2 }, + }); + db.medicLogs.put.resolves(); + + await replicationFailureLog.capture({ name: 'bob', roles: ['chw'] }, 'req', 500, 100); + + expect(db.medicLogs.put.args[0][0].daily_counts).to.deep.equal({ + '2026-04-13': 2, + '2026-04-14': 2, + '2026-04-15': 1, + }); }); }); }); diff --git a/ddocs/logs-db/logs/views/replication_failures/map.js b/ddocs/logs-db/logs/views/replication_failures/map.js new file mode 100644 index 00000000000..d4932cff98d --- /dev/null +++ b/ddocs/logs-db/logs/views/replication_failures/map.js @@ -0,0 +1,7 @@ +function(doc) { + if (doc._id.indexOf('replication-fail-') === 0 && doc.daily_counts && doc.user) { + Object.keys(doc.daily_counts).forEach(function(day) { + emit([day, doc.user], doc.daily_counts[day]); + }); + } +} diff --git a/tests/integration/api/controllers/monitoring.spec.js b/tests/integration/api/controllers/monitoring.spec.js index 9c39834fd0c..637d96a3359 100644 --- a/tests/integration/api/controllers/monitoring.spec.js +++ b/tests/integration/api/controllers/monitoring.spec.js @@ -1,4 +1,5 @@ const utils = require('@utils'); +const moment = require('moment'); const sentinelUtils = require('@utils/sentinel'); const VIEW_INDEXES_BY_DB = { @@ -286,6 +287,9 @@ describe('monitoring', () => { replication_limit: { count: 0, }, + replication_failure: { + count: 0, + }, connected_users: { count: 0, }, @@ -293,5 +297,38 @@ describe('monitoring', () => { assertIndeterminateFields(result); }); + + it('should count distinct users with replication failures in the configured window', async () => { + const today = moment(); + const recent = today.clone().subtract(2, 'days'); + const onBoundary = today.clone().subtract(7, 'days'); + const tooOld = today.clone().subtract(14, 'days'); + const failureDoc = (period, user, daily_counts) => ({ + _id: `replication-fail-${period}-${user}`, + user, + date: today.valueOf(), + total_failures: Object.values(daily_counts).reduce((a, b) => a + b, 0), + failures: [], + daily_counts, + }); + const dayKey = (m) => m.format('YYYY-MM-DD'); + const periodKey = (m) => m.format('YYYY-MM'); + + const seedDocs = [ + failureDoc(periodKey(today), 'alice', { [dayKey(recent)]: 3 }), + failureDoc(periodKey(onBoundary), 'alice', { [dayKey(onBoundary)]: 2 }), + failureDoc(periodKey(recent), 'bob', { [dayKey(recent)]: 1 }), + failureDoc(periodKey(onBoundary), 'clare', { [dayKey(onBoundary)]: 2 }), + failureDoc(periodKey(tooOld), 'dan', { [dayKey(tooOld)]: 99 }), + ]; + + await utils.logsDb.bulkDocs(seedDocs); + + const defaultWindow = await utils.request({ path: '/api/v2/monitoring' }); + chai.expect(defaultWindow.replication_failure).to.deep.equal({ count: 3 }); + + const widerWindow = await utils.request({ path: '/api/v2/monitoring?connected_user_interval=30' }); + chai.expect(widerWindow.replication_failure).to.deep.equal({ count: 4 }); + }); }); }); diff --git a/tests/integration/api/controllers/replication-failure-log.spec.js b/tests/integration/api/controllers/replication-failure-log.spec.js index b4f70fe3cad..8e1af11748d 100644 --- a/tests/integration/api/controllers/replication-failure-log.spec.js +++ b/tests/integration/api/controllers/replication-failure-log.spec.js @@ -1,7 +1,6 @@ const utils = require('@utils'); const moment = require('moment'); const { CONTACT_TYPES } = require('@medic/constants'); -const constants = require('@constants'); const userFactory = require('@factories/cht/users/users'); const placeFactory = require('@factories/cht/contacts/place'); const personFactory = require('@factories/cht/contacts/person'); @@ -81,22 +80,6 @@ describe('replication failure logging @docker', () => { return response.data.find(log => log._id === currentPeriodId) || response.data[0] || null; }; - const clearLogs = async () => { - const result = await utils.logsDb.allDocs({ - include_docs: true, - startkey: 'replication-fail-', - endkey: 'replication-fail-\ufff0' - }); - - const purgeDocs = {}; - result.rows.forEach(row => purgeDocs[row.id] = [row.value.rev]); - await utils.request({ - path: `/${constants.DB_NAME}-logs/_purge`, - method: 'POST', - body: purgeDocs - }); - }; - const requestDocsExpectingError = async (username) => { await expect(replicationGetIds(username)).to.be.rejectedWith(); await utils.delayPromise(SETTLE_DELAY_MS); @@ -113,7 +96,7 @@ describe('replication failure logging @docker', () => { }); afterEach(async () => { - await clearLogs(); + await utils.clearReplicationFailureLogs(); }); describe('on successful replication', () => { @@ -179,6 +162,46 @@ describe('replication failure logging @docker', () => { }); }); + it('should initialise daily_counts with today\'s bucket on a fresh failure', async () => { + await requestDocsExpectingError('mathil'); + + const log = await getUserFailureLog('mathil'); + const today = moment().format('YYYY-MM-DD'); + expect(log.daily_counts).to.deep.equal({ [today]: 1 }); + }); + + it('should increment the same-day bucket across multiple failures', async () => { + await requestDocsExpectingError('mathil'); + await requestDocsExpectingError('mathil'); + await requestDocsExpectingError('mathil'); + + const log = await getUserFailureLog('mathil'); + const today = moment().format('YYYY-MM-DD'); + expect(log.daily_counts).to.deep.equal({ [today]: 3 }); + expect(log.total_failures).to.equal(3); + }); + + it('should preserve buckets from previous days when capturing a new failure', async () => { + // Seed a real log doc with historical buckets so we can verify the merge, not just the create. + await requestDocsExpectingError('mathil'); + const logId = getFailureLogId('mathil'); + const seeded = await utils.logsDb.get(logId); + const yesterday = moment().subtract(1, 'day').format('YYYY-MM-DD'); + const twoDaysAgo = moment().subtract(2, 'days').format('YYYY-MM-DD'); + seeded.daily_counts = { [twoDaysAgo]: 4, [yesterday]: 2 }; + await utils.logsDb.put(seeded); + + await requestDocsExpectingError('mathil'); + + const log = await getUserFailureLog('mathil'); + const today = moment().format('YYYY-MM-DD'); + expect(log.daily_counts).to.deep.equal({ + [twoDaysAgo]: 4, + [yesterday]: 2, + [today]: 1, + }); + }); + it('should cap stored failures at 50 and track total count', async () => { // Create an initial failure to get a real log doc await requestDocsExpectingError('mathil'); diff --git a/tests/utils/index.js b/tests/utils/index.js index 4beb3948aa0..89f29e9e665 100644 --- a/tests/utils/index.js +++ b/tests/utils/index.js @@ -779,10 +779,23 @@ const revertDb = async (except = [], ignoreRefresh = true) => { //NOSONAR await deleteMetaDbs(); await deleteCredentials(); + await clearReplicationFailureLogs(); await setUserContactDoc(); }; +const clearReplicationFailureLogs = async () => { + const result = await logsDb.allDocs({ + startkey: 'replication-fail-', + endkey: 'replication-fail-\ufff0', + }); + if (!result.rows.length) { + return; + } + const docs = result.rows.map(row => ({ _id: row.id, _rev: row.value.rev, _deleted: true })); + await logsDb.bulkDocs(docs); +}; + const getOrigin = () => `${constants.BASE_URL}`; const getBaseUrl = () => `${constants.BASE_URL}/#/`; @@ -1813,6 +1826,7 @@ module.exports = { updateSettings, revertSettings, revertDb, + clearReplicationFailureLogs, getOrigin, getBaseUrl, getAdminBaseUrl,