Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions api/src/controllers/monitoring.js
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,14 @@ const DEFAULT_CONNECTED_USER_INTERVAL = 7;
* count:
* type: number
* description: Number of users that exceeded the replication limit of documents.
* MonitoringReplicationFailure:
* type: object
* properties:
* count:
* type: number
* description: >
* Number of distinct users that had at least one replication failure in the current
* or previous calendar month.
* MonitoringConnectedUsers:
* type: object
* properties:
Expand Down Expand Up @@ -372,6 +380,8 @@ module.exports = {
* $ref: '#/components/schemas/MonitoringConflict'
* replication_limit:
* $ref: '#/components/schemas/MonitoringReplicationLimit'
* replication_failure:
* $ref: '#/components/schemas/MonitoringReplicationFailure'
* connected_users:
* $ref: '#/components/schemas/MonitoringConnectedUsers'
*/
Expand Down
14 changes: 13 additions & 1 deletion api/src/services/monitoring.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const db = require('../db');
const environment = require('@medic/environment');
const logger = require('@medic/logger');
const deployInfoService = require('./deploy-info');
const replicationFailureLog = require('./replication/replication-failure-log');
const { SENTINEL_METADATA } = require('@medic/constants');

const DBS_TO_MONITOR = {
Expand Down Expand Up @@ -347,6 +348,15 @@ const getReplicationLimitLog = () => {
});
};

/**
 * Resolves with the number of users reported by the replication failure log service.
 * This metric is best-effort: a rejection from the service is logged and reported as -1
 * instead of being propagated to the caller.
 *
 * @returns {Promise<number>} the user count, or -1 when the lookup fails
 */
const getReplicationFailuresUserCount = () => {
  const reportAsUnavailable = (error) => {
    logger.error('Error fetching replication failures user count: %o', error);
    return -1;
  };

  return replicationFailureLog.getUsersWithFailuresCount().catch(reportAsUnavailable);
};

const getConnectedUserLogs = (connectedUserInterval) => {
const earliestTimestamp = moment().subtract(connectedUserInterval, 'days').valueOf();
return db.medicLogs
Expand Down Expand Up @@ -428,13 +438,15 @@ const jsonV2 = (connectedUserInterval) => {
jsonV1(connectedUserInterval),
getWeeklyOutgoingMessageStatusCounts(),
getLastHundredStatusUpdatesCounts(),
getReplicationFailuresUserCount(),
])
.then(([jsonV1, weeklyOutgoingMessageStatus, lastHundredCounts]) => {
.then(([jsonV1, weeklyOutgoingMessageStatus, lastHundredCounts, replicationFailuresUserCount]) => {
jsonV1.messaging.outgoing = {
total: jsonV1.messaging.outgoing.state,
seven_days: weeklyOutgoingMessageStatus,
last_hundred: lastHundredCounts,
};
jsonV1.replication_failure = { count: replicationFailuresUserCount };

return jsonV1;
});
Expand Down
21 changes: 21 additions & 0 deletions api/src/services/replication/replication-failure-log.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ const pagination = require('../pagination');
const TYPE_PREFIX = `replication-fail-`;
const MAX_FAILURES = 50;
const REPORTING_PERIOD_FORMAT = 'YYYY-MM';
const REPORTING_PERIOD_LENGTH = REPORTING_PERIOD_FORMAT.length;
Comment thread
dianabarsan marked this conversation as resolved.
Outdated
const UNKNOWN = 'unknown';
const MAX_PERIODS = 60;
const RECENT_PERIODS_FOR_USER_COUNT = 2;

const captureFailure = async (userCtx, requestId, statusCode, duration) => {
const log = await getLog(userCtx.name);
Expand Down Expand Up @@ -151,7 +153,26 @@ const get = async ({ user, reportingPeriod, cursor = 0, limit = pagination.DEFAU
return getPageByRange({ reportingPeriod, skip: cursor, limit });
};

const getRecentReportingPeriodsRange = () => {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Returning the current + previous period's data is pretty confusing to the consumer. 😓 I think I understand why you did it (it is better than just returning the current OR the previous period's data), but still....

I would like this much better if we could just use a rolling sum from the last 30 days. (That kind of value will be WAY easier to track/measure in Watchdog.) However, I think doing that efficiently would require a view (like we have for logs/connected_users). I know you mentioned having view PTSD 😬, but at a certain point the functionality tradeoffs are not worth it just to avoid the disk space of the view... 🤷

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea, I really don't want to add views right now :D

Do you think "the current and previous calendar month" is a hard concept for the consumer to grasp?
Since this just means merging two periods together, since our history of failures is dated (although limited to 50). It would add a little more logic here to merge the two failure logs.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the actual returned value can change dramatically on the day the new reporting period begins.

I see your concern, which is why I am taking the last two reporting periods when returning results.

// Builds the allDocs key range that covers the most recent reporting periods.
// `earliest` is (RECENT_PERIODS_FOR_USER_COUNT - 1) months before now and `latest` is the current
// month, both formatted with REPORTING_PERIOD_FORMAT.
const now = moment();
const earliest = now.clone().subtract(RECENT_PERIODS_FOR_USER_COUNT - 1, 'month').format(REPORTING_PERIOD_FORMAT);
const latest = now.format(REPORTING_PERIOD_FORMAT);
return {
// Ids are `<TYPE_PREFIX><period>-<rest>`, so the range starts at the first id of the earliest period...
startkey: `${TYPE_PREFIX}${earliest}-`,
// ...and the trailing \ufff0 makes the end key sort after the ids that begin with the latest period prefix.
endkey: `${TYPE_PREFIX}${latest}-\ufff0`,
};
};

/**
 * Extracts the username from a replication failure log document id.
 *
 * Ids have the shape `<TYPE_PREFIX><reporting period>-<username>`, for example
 * `replication-fail-2026-03-sir-bob`. Everything after the separator that follows the
 * reporting period is the username, so usernames that contain dashes are preserved intact.
 *
 * @param {string} docId - id of a replication failure log document
 * @returns {string} the username portion of the id, without the leading separator
 */
const usernameFromDocId = (docId) => {
  // Skip the type prefix, the reporting period AND the single '-' that separates the period from
  // the username; stopping one character earlier would yield "-alice" instead of "alice".
  const usernameStart = TYPE_PREFIX.length + REPORTING_PERIOD_LENGTH + 1;
  return docId.slice(usernameStart);
};

/**
 * Counts the distinct users that have at least one replication failure log document within the
 * key range returned by getRecentReportingPeriodsRange. A user with documents in more than one
 * reporting period is counted once.
 *
 * @returns {Promise<number>} number of distinct users; rejects if the allDocs request fails
 */
const getUsersWithFailuresCount = async () => {
  const keyRange = getRecentReportingPeriodsRange();
  const { rows } = await db.medicLogs.allDocs(keyRange);

  const distinctUsers = new Set();
  for (const { id } of rows) {
    distinctUsers.add(usernameFromDocId(id));
  }
  return distinctUsers.size;
};

module.exports = {
capture: captureFailure,
get,
getUsersWithFailuresCount,
};
15 changes: 15 additions & 0 deletions api/tests/mocha/services/monitoring.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ const _ = require('lodash');
const db = require('../../../src/db');
const environment = require('@medic/environment');
const deployInfo = require('../../../src/services/deploy-info');
const replicationFailureLog = require('../../../src/services/replication/replication-failure-log');
const service = require('../../../src/services/monitoring');
const { getBundledDdocs } = require('../../../src/services/setup/utils');
const { DATABASES } = require('../../../src/services/setup/databases');
Expand Down Expand Up @@ -245,6 +246,7 @@ const setUpMocks = () => {
.resolves({ rows: [ { value: 1 } ] })
.withArgs('logs/connected_users', { startkey: 0, reduce: true })
.resolves({ rows: [ { value: 2 } ] });
sinon.stub(replicationFailureLog, 'getUsersWithFailuresCount').resolves(5);
};

const generateRows = (statusCounters) => {
Expand Down Expand Up @@ -568,6 +570,7 @@ describe('Monitoring service', () => {
chai.expect(actual.conflict).to.deep.equal({ count: 40 });
chai.expect(actual.date.current).to.equal(0);
chai.expect(actual.replication_limit.count).to.equal(1);
chai.expect(actual.replication_failure).to.deep.equal({ count: 5 });
chai.expect(actual.connected_users.count).to.equal(2);
chai.expect(request.get.args).to.deep.equalInAnyOrder([
[{ json: true, url: environment.serverUrl }],
Expand Down Expand Up @@ -603,6 +606,16 @@ describe('Monitoring service', () => {
});
});

// Guards the v1 response shape: replication_failure must be absent from jsonV1 output.
// setUpMocks() stubs replicationFailureLog.getUsersWithFailuresCount, which is what makes the
// `.called` check below possible.
it('v1 does not include replication_failure', () => {
setUpMocks();

return service.jsonV1().then(actual => {
chai.expect(actual).not.to.have.property('replication_failure');
// v1 must not even invoke the replication-failure service.
chai.expect(replicationFailureLog.getUsersWithFailuresCount.called).to.equal(false);
});
});

it('v1 handles errors gracefully', () => {
sinon.stub(deployInfo, 'get').rejects();
sinon.stub(request, 'get').rejects();
Expand Down Expand Up @@ -727,6 +740,7 @@ describe('Monitoring service', () => {
sinon.stub(db.sentinel, 'query').rejects();
sinon.stub(db.medicUsersMeta, 'query').rejects();
sinon.stub(db.medicLogs, 'query').rejects();
sinon.stub(replicationFailureLog, 'getUsersWithFailuresCount').rejects();

return service.jsonV2().then(actual => {
chai.expect(actual.version).to.deep.equal({
Expand Down Expand Up @@ -829,6 +843,7 @@ describe('Monitoring service', () => {
chai.expect(actual.outbound_push).to.deep.equal({ backlog: -1 });
chai.expect(actual.feedback).to.deep.equal({ count: -1 });
chai.expect(actual.replication_limit).to.deep.equal({ count: -1 });
chai.expect(actual.replication_failure).to.deep.equal({ count: -1 });
chai.expect(actual.connected_users).to.deep.equal({ count: -1 });
chai.expect(request.get.args).to.deep.equalInAnyOrder([
[{ json: true, url: environment.serverUrl }],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,78 @@ describe('Replication Failure Log Service', () => {
});
});

// Unit tests for getUsersWithFailuresCount: the clock is pinned with fake timers so the queried
// key range is deterministic, and db.medicLogs.allDocs supplies canned rows.
// NOTE(review): db.medicLogs.allDocs is presumably stubbed, and the fake timers restored, by shared
// setup/teardown outside this block — confirm.
describe('getUsersWithFailuresCount', () => {
it('should return the number of distinct users in the current + previous calendar months', async () => {
sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf());
db.medicLogs.allDocs.resolves({
rows: [
{ id: 'replication-fail-2026-03-alice' },
{ id: 'replication-fail-2026-03-bob' },
{ id: 'replication-fail-2026-04-alice' },
{ id: 'replication-fail-2026-04-clare' },
],
});

const result = await replicationFailureLog.getUsersWithFailuresCount();

// A single range query must cover both months.
expect(db.medicLogs.allDocs.callCount).to.equal(1);
expect(db.medicLogs.allDocs.args[0][0]).to.deep.equal({
startkey: 'replication-fail-2026-03-',
endkey: 'replication-fail-2026-04-\ufff0',
});
// alice has docs in both months but is counted once.
expect(result).to.equal(3);
});

it('should return 0 when no users have failures in the window', async () => {
sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf());
db.medicLogs.allDocs.resolves({ rows: [] });

const result = await replicationFailureLog.getUsersWithFailuresCount();

expect(result).to.equal(0);
});

it('should handle usernames that contain dashes', async () => {
sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf());
db.medicLogs.allDocs.resolves({
rows: [
{ id: 'replication-fail-2026-03-sir-bob' },
{ id: 'replication-fail-2026-04-sir-bob' },
{ id: 'replication-fail-2026-04-mary-jane' },
],
});

const result = await replicationFailureLog.getUsersWithFailuresCount();

// sir-bob appears in both months; the dashes inside usernames must not split them further.
expect(result).to.equal(2);
});

it('should span across a year boundary', async () => {
// In January the previous reporting period belongs to the previous year.
sinon.useFakeTimers(new Date('2026-01-05T12:00:00Z').valueOf());
db.medicLogs.allDocs.resolves({ rows: [] });

await replicationFailureLog.getUsersWithFailuresCount();

expect(db.medicLogs.allDocs.args[0][0]).to.deep.equal({
startkey: 'replication-fail-2025-12-',
endkey: 'replication-fail-2026-01-\ufff0',
});
});

it('should propagate db errors', async () => {
sinon.useFakeTimers(new Date('2026-04-15T12:00:00Z').valueOf());
db.medicLogs.allDocs.rejects({ status: 500, message: 'db error' });

// NOTE(review): if the call unexpectedly resolves, the error thrown by expect.fail is caught by
// the catch below and surfaces as a deep-equal mismatch rather than as 'should have thrown'.
try {
await replicationFailureLog.getUsersWithFailuresCount();
expect.fail('should have thrown');
} catch (err) {
expect(err).to.deep.equal({ status: 500, message: 'db error' });
}
});
});

describe('capture', () => {
it('should create a new log when none exists', async () => {
const now = new Date('2026-04-15T12:00:00Z').valueOf();
Expand Down
32 changes: 32 additions & 0 deletions tests/integration/api/controllers/monitoring.spec.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
const utils = require('@utils');
const moment = require('moment');
const sentinelUtils = require('@utils/sentinel');

const VIEW_INDEXES_BY_DB = {
Expand Down Expand Up @@ -286,12 +287,43 @@ describe('monitoring', () => {
replication_limit: {
count: 0,
},
replication_failure: {
count: 0,
},
connected_users: {
count: 0,
},
});

assertIndeterminateFields(result);
});

// End-to-end check of the replication_failure metric: seeds failure log docs directly into the logs
// database and expects /api/v2/monitoring to count each user in the current or previous month once.
it('should count distinct users with replication failures in the last 2 calendar months', async () => {
// Periods are derived from the real clock so the seeded ids line up with what the API queries.
// NOTE(review): the three moment() calls are independent; a run that straddles a month boundary
// could compute inconsistent periods — confirm whether that matters here.
const currentPeriod = moment().format('YYYY-MM');
const previousPeriod = moment().subtract(1, 'month').format('YYYY-MM');
const olderPeriod = moment().subtract(2, 'month').format('YYYY-MM');
// Builds a failure log doc whose id follows the `replication-fail-<period>-<user>` pattern.
const failureDoc = (period, user) => ({
_id: `replication-fail-${period}-${user}`,
user,
date: Date.now(),
total_failures: 1,
failures: [{ date: Date.now(), status_code: 500, duration: 100, request_id: 'seed' }],
});
// alice appears in both windowed months and must only be counted once.
// dan is outside the 2-month window and must be excluded.
const seedDocs = [
failureDoc(currentPeriod, 'alice'),
failureDoc(currentPeriod, 'bob'),
failureDoc(previousPeriod, 'alice'),
failureDoc(previousPeriod, 'clare'),
failureDoc(olderPeriod, 'dan'),
];

await utils.logsDb.bulkDocs(seedDocs);

const result = await utils.request({ path: '/api/v2/monitoring' });

// Expected distinct users in the window: alice, bob, clare.
chai.expect(result.replication_failure).to.deep.equal({ count: 3 });
});
});
});
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
const utils = require('@utils');
const moment = require('moment');
const { CONTACT_TYPES } = require('@medic/constants');
const constants = require('@constants');
const userFactory = require('@factories/cht/users/users');
const placeFactory = require('@factories/cht/contacts/place');
const personFactory = require('@factories/cht/contacts/person');
Expand Down Expand Up @@ -81,22 +80,6 @@ describe('replication failure logging @docker', () => {
return response.data.find(log => log._id === currentPeriodId) || response.data[0] || null;
};

const clearLogs = async () => {
const result = await utils.logsDb.allDocs({
include_docs: true,
startkey: 'replication-fail-',
endkey: 'replication-fail-\ufff0'
});

const purgeDocs = {};
result.rows.forEach(row => purgeDocs[row.id] = [row.value.rev]);
await utils.request({
path: `/${constants.DB_NAME}-logs/_purge`,
method: 'POST',
body: purgeDocs
});
};

const requestDocsExpectingError = async (username) => {
await expect(replicationGetIds(username)).to.be.rejectedWith();
await utils.delayPromise(SETTLE_DELAY_MS);
Expand All @@ -113,7 +96,7 @@ describe('replication failure logging @docker', () => {
});

afterEach(async () => {
await clearLogs();
await utils.clearReplicationFailureLogs();
});

describe('on successful replication', () => {
Expand Down
14 changes: 14 additions & 0 deletions tests/utils/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -763,10 +763,23 @@ const revertDb = async (except = [], ignoreRefresh = true) => { //NOSONAR

await deleteMetaDbs();
await deleteCredentials();
await clearReplicationFailureLogs();

await setUserContactDoc();
};

/**
 * Removes every replication failure log document from the logs database.
 * Looks up all ids in the `replication-fail-` key range and, when any exist, writes a deletion
 * stub (`_deleted: true`) for each of them in a single bulkDocs call.
 *
 * @returns {Promise<void>} resolves once the deletions have been submitted, or immediately when
 *   there is nothing to delete
 */
const clearReplicationFailureLogs = async () => {
  const { rows } = await logsDb.allDocs({
    startkey: 'replication-fail-',
    endkey: 'replication-fail-\ufff0',
  });

  // Nothing stored: skip the write entirely.
  if (!rows.length) {
    return;
  }

  const deletions = rows.map(({ id, value }) => ({
    _id: id,
    _rev: value.rev,
    _deleted: true,
  }));
  await logsDb.bulkDocs(deletions);
};

const getOrigin = () => `${constants.BASE_URL}`;

const getBaseUrl = () => `${constants.BASE_URL}/#/`;
Expand Down Expand Up @@ -1797,6 +1810,7 @@ module.exports = {
updateSettings,
revertSettings,
revertDb,
clearReplicationFailureLogs,
getOrigin,
getBaseUrl,
getAdminBaseUrl,
Expand Down
Loading