diff --git a/PUBLISHING.md b/PUBLISHING.md index 4583db35c..8a4ecb499 100644 --- a/PUBLISHING.md +++ b/PUBLISHING.md @@ -111,6 +111,7 @@ See the full list of Microsoft-controlled scopes: `@microsoft`, `@azure`, | AgentMesh Copilot Governance | `@microsoft/agentmesh-copilot-governance` | `packages/agentmesh-integrations/copilot-governance` | | AgentMesh Mastra | `@microsoft/agentmesh-mastra` | `packages/agentmesh-integrations/mastra-agentmesh` | | AgentMesh API | `@microsoft/agentmesh-api` | `packages/agent-mesh/services/api` | +| AgentMesh MCP Governance | `@microsoft/agentmesh-mcp-governance` | `packages/agent-mesh/packages/mcp-governance` | | AgentMesh MCP Proxy | `@microsoft/agentmesh-mcp-proxy` | `packages/agent-mesh/packages/mcp-proxy` | | AgentMesh SDK | `@microsoft/agentmesh-sdk` | `packages/agent-mesh/sdks/typescript` | | Agent OS Copilot Extension | `@microsoft/agent-os-copilot-extension` | `packages/agent-os/extensions/copilot` | diff --git a/packages/agent-mesh/sdks/typescript/README.md b/packages/agent-mesh/sdks/typescript/README.md index 216d0c4a9..8e347aaf3 100644 --- a/packages/agent-mesh/sdks/typescript/README.md +++ b/packages/agent-mesh/sdks/typescript/README.md @@ -11,13 +11,19 @@ Provides agent identity (Ed25519 DIDs), trust scoring, policy evaluation, hash-c ## Installation ```bash -npm install @agentmesh/sdk +npm install @microsoft/agentmesh-sdk +``` + +For MCP-only workloads, install the standalone governance package instead: + +```bash +npm install @microsoft/agentmesh-mcp-governance ``` ## Quick Start ```typescript -import { AgentMeshClient } from '@agentmesh/sdk'; +import { AgentMeshClient } from '@microsoft/agentmesh-sdk'; const client = AgentMeshClient.create('my-agent', { capabilities: ['data.read', 'data.write'], @@ -44,7 +50,7 @@ console.log(client.audit.verify()); // true Manage agent identities built on Ed25519 key pairs. 
```typescript -import { AgentIdentity } from '@agentmesh/sdk'; +import { AgentIdentity } from '@microsoft/agentmesh-sdk'; const identity = AgentIdentity.generate('agent-1', ['read']); const signature = identity.sign(new TextEncoder().encode('hello')); @@ -60,7 +66,7 @@ const restored = AgentIdentity.fromJSON(json); Track and score trust for peer agents. ```typescript -import { TrustManager } from '@agentmesh/sdk'; +import { TrustManager } from '@microsoft/agentmesh-sdk'; const tm = new TrustManager({ initialScore: 0.5, decayFactor: 0.95 }); @@ -76,7 +82,7 @@ const score = tm.getTrustScore('peer-1'); Rule-based policy evaluation with conditions and YAML support. ```typescript -import { PolicyEngine } from '@agentmesh/sdk'; +import { PolicyEngine } from '@microsoft/agentmesh-sdk'; const engine = new PolicyEngine([ { action: 'data.*', effect: 'allow' }, @@ -96,7 +102,7 @@ await engine.loadFromYAML('./policy.yaml'); Append-only audit log with hash-chain integrity verification. ```typescript -import { AuditLogger } from '@agentmesh/sdk'; +import { AuditLogger } from '@microsoft/agentmesh-sdk'; const logger = new AuditLogger(); @@ -113,7 +119,7 @@ logger.exportJSON(); // full log as JSON string Unified client tying identity, trust, policy, and audit together. ```typescript -import { AgentMeshClient } from '@agentmesh/sdk'; +import { AgentMeshClient } from '@microsoft/agentmesh-sdk'; const client = AgentMeshClient.create('my-agent', { policyRules: [{ action: 'data.*', effect: 'allow' }], @@ -123,6 +129,125 @@ const result = await client.executeWithGovernance('data.read', { user: 'alice' } // result: { decision, trustScore, auditEntry, executionTime } ``` +### MCP Security + +Use the MCP security primitives to govern both tool definitions and runtime traffic. +You can access the same governance surface either from the full SDK or from the standalone MCP package. 
+ +#### Full SDK install + +```typescript +import { + ApprovalStatus, + CredentialRedactor, + MCPGateway, + MCPMessageSigner, + MCPResponseScanner, + MCPSecurityScanner, + MCPSessionAuthenticator, + MCPSlidingRateLimiter, +} from '@microsoft/agentmesh-sdk'; +``` + +#### Standalone MCP governance install + +```typescript +import { + ApprovalStatus, + CredentialRedactor, + MCPGateway, + MCPMessageSigner, + MCPResponseScanner, + MCPSecurityScanner, + MCPSessionAuthenticator, + MCPSlidingRateLimiter, +} from '@microsoft/agentmesh-mcp-governance'; +``` + +Both entry points expose the same MCP governance primitives; the standalone package has no dependency on the rest of the Agent Governance Toolkit (AGT) SDK. The example below wires the primitives together using the full SDK import. + +```typescript +import { + ApprovalStatus, + CredentialRedactor, + MCPGateway, + MCPMessageSigner, + MCPResponseScanner, + MCPSecurityScanner, + MCPSessionAuthenticator, + MCPSlidingRateLimiter, +} from '@microsoft/agentmesh-sdk'; + +const responseScanner = new MCPResponseScanner(); +const redactor = new CredentialRedactor(); +const sessionAuth = new MCPSessionAuthenticator({ + secret: process.env.MCP_SESSION_SECRET!, +}); +const messageSigner = new MCPMessageSigner({ + secret: process.env.MCP_SIGNING_SECRET!, +}); +const rateLimiter = new MCPSlidingRateLimiter({ + maxRequests: 60, + windowMs: 60_000, +}); +const securityScanner = new MCPSecurityScanner(); + +const gateway = new MCPGateway({ + allowedTools: ['read_file', 'search_docs'], + sensitiveTools: ['deploy'], + rateLimiter, + approvalHandler: async ({ toolName }) => + toolName === 'deploy' + ?
ApprovalStatus.Approved + : ApprovalStatus.Pending, +}); + +const toolDecision = await gateway.evaluateToolCall('agent-1', 'read_file', { + path: '/workspace/README.md', +}); +const issuedSession = await sessionAuth.issueToken('agent-1'); +const verifiedSession = await sessionAuth.verifyToken( + issuedSession.token, + 'agent-1', +); +const signedMessage = messageSigner.sign({ + tool: 'read_file', + args: { path: '/workspace/README.md' }, +}); +const verifiedMessage = await messageSigner.verify(signedMessage); +const toolThreats = securityScanner.scanTool( + 'read_file', + 'Read the contents of a file at the specified path.', + { + type: 'object', + properties: { path: { type: 'string' } }, + required: ['path'], + additionalProperties: false, + }, + 'filesystem-server', +); +const scannedResponse = responseScanner.scan({ + text: 'Search completed successfully.', +}); +const redactedSecrets = redactor.redact({ + bearerToken: 'Bearer abcdefghijklmnop', +}); +``` + +The MCP surface adds: + +- **MCPResponseScanner** — strips and flags prompt-injection tags, imperative phrasing, credential leaks, and exfiltration URLs before tool output reaches an LLM +- **MCPSessionAuthenticator** — HMAC-backed session tokens bound to agent identity with TTL expiry and concurrent-session enforcement +- **MCPMessageSigner** — HMAC-SHA256 request signing with timestamps and nonce replay protection +- **CredentialRedactor** — secret redaction for strings and nested object graphs +- **MCPSlidingRateLimiter** — per-agent sliding-window rate limiting +- **MCPSecurityScanner** — tool metadata scanning for poisoning, rug pulls, cross-server attacks, description injection, and schema abuse +- **MCPGateway** — deny-list, allow-list, sanitization, rate limiting, and approval orchestration + +> [!NOTE] +> The built-in nonce and session stores are in-memory and intended for single-process development or tests. 
+> In multi-replica or enterprise deployments, implement the provided store interfaces against durable shared storage and inject shared clock/nonce providers for deterministic behavior. + ## Development ```bash diff --git a/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/README.md b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/README.md new file mode 100644 index 000000000..5c016e3d1 --- /dev/null +++ b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/README.md @@ -0,0 +1,99 @@ +# MCP Express Server Example + +This example shows a minimal Express.js server running the full AgentMesh MCP governance pipeline for a `POST /call-tool` endpoint. + +## What it demonstrates + +- `MCPGateway` allow-list, sanitization, rate limiting, and approval flow +- `MCPMessageSigner` signing and verification of tool calls +- `MCPSessionAuthenticator` session tokens with TTL-bound agent identity +- `MCPSlidingRateLimiter` per-agent request throttling +- `MCPSecurityScanner` request inspection for prompt-injection style content +- `MCPResponseScanner` output scanning for credentials and exfiltration patterns +- `CredentialRedactor` redaction before logging +- `GET /health` for readiness plus a short-lived demo session token + +## Prerequisites + +- Node 18+ +- npm + +## Install and run + +```bash +cd packages/agent-mesh/sdks/typescript/examples/mcp-express-server +npm install +npx tsx src/server.ts +``` + +The example runs against the checked-out SDK source in this repository so reviewers can exercise the current branch without publishing a package first. 
+ +## Endpoints + +- `GET /health` - readiness plus a demo session token for `demo-agent` +- `POST /call-tool` - authenticates the session, signs and verifies the call, rate-limits, scans the request, applies the gateway policy, executes the tool, then scans the response and redacts the log entry + +## Example curl flows + +Fetch a demo session token: + +```bash +curl http://127.0.0.1:3000/health +``` + +Use the returned `demoSessionToken` in a governed tool call: + +```bash +curl -X POST http://127.0.0.1:3000/call-tool \ + -H "content-type: application/json" \ + -H "x-session-token: <demoSessionToken>" \ + -d '{ + "agentId": "demo-agent", + "toolName": "search_docs", + "args": { "query": "OWASP MCP" } + }' +``` + +Trigger the path-traversal guard: + +```bash +curl -X POST http://127.0.0.1:3000/call-tool \ + -H "content-type: application/json" \ + -H "x-session-token: <demoSessionToken>" \ + -d '{ + "agentId": "demo-agent", + "toolName": "read_file", + "args": { "path": "../secrets.txt", "approved": true } + }' +``` + +Trigger response scanning for leaked credentials: + +```bash +curl -X POST http://127.0.0.1:3000/call-tool \ + -H "content-type: application/json" \ + -H "x-session-token: <demoSessionToken>" \ + -d '{ + "agentId": "demo-agent", + "toolName": "read_file", + "args": { "path": "workspace/secrets.txt", "approved": true } + }' +``` + +## OWASP MCP mapping + +| Primitive | Example role | +| --- | --- | +| `MCPSessionAuthenticator` | Session binding and expiry | +| `MCPMessageSigner` | Signed tool-call envelopes | +| `MCPGateway` | Deny/allow/sanitize/approve pipeline | +| `MCPSlidingRateLimiter` | Request throttling | +| `MCPSecurityScanner` | Prompt-injection style request inspection | +| `MCPResponseScanner` | Output scanning and fail-closed blocking | +| `CredentialRedactor` | Safe audit logging | + +## Run the smoke test + +```bash +npm test +``` diff --git a/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/package.json b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/package.json new file mode 100644 index 000000000..0a5bed3c1 --- /dev/null +++
b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/package.json @@ -0,0 +1,26 @@ +{ + "name": "mcp-express-server-example", + "private": true, + "version": "0.0.1", + "description": "Public Preview — Express.js MCP governance example for the AgentMesh TypeScript SDK", + "type": "module", + "scripts": { + "start": "tsx src/server.ts", + "test": "tsx --test test/server.test.ts" + }, + "peerDependencies": { + "@microsoft/agentmesh-sdk": "3.0.2" + }, + "dependencies": { + "express": "4.21.2" + }, + "devDependencies": { + "@types/express": "5.0.3", + "@types/node": "25.5.0", + "tsx": "4.19.3", + "typescript": "5.7.3" + }, + "engines": { + "node": ">=18.0.0" + } +} diff --git a/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/src/server.ts b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/src/server.ts new file mode 100644 index 000000000..b294a0156 --- /dev/null +++ b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/src/server.ts @@ -0,0 +1,227 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { randomBytes } from 'node:crypto'; +import { pathToFileURL } from 'node:url'; +import express, { type Express } from 'express'; +import sdk from '../../../src/index.ts'; + +type ToolHandler = { + description: string; + inputSchema: Record; + run(args: Record): Promise>; +}; + +const toolHandlers: Record = { + search_docs: { + description: 'Search internal docs for a topic and return a concise answer.', + inputSchema: { + type: 'object', + properties: { query: { type: 'string' } }, + required: ['query'], + additionalProperties: false, + }, + async run(args) { + const query = readString(args, 'query') ?? 
'agent governance'; + return { + answer: `Search results for "${query}"`, + source: 'docs://agentmesh/owasp-mcp', + }; + }, + }, + read_file: { + description: 'Read a file from the demo workspace and return its contents.', + inputSchema: { + type: 'object', + properties: { + path: { type: 'string' }, + approved: { type: 'boolean' }, + }, + required: ['path'], + additionalProperties: false, + }, + async run(args) { + const path = readString(args, 'path') ?? 'README.md'; + if (path.endsWith('secrets.txt')) { + return { + contents: 'Bearer abcdefghijklmnop demo@example.com', + path, + }; + } + return { + contents: `Read ${path} successfully.`, + path, + }; + }, + }, +}; + +const toolDefinitions = Object.entries(toolHandlers).map( + ([name, handler]) => ({ + name, + description: handler.description, + inputSchema: handler.inputSchema, + }), +); + +export function createExampleServer(): { + app: Express; + issueDemoSession(agentId?: string): Promise; +} { + const auditSink = new sdk.InMemoryMCPAuditSink(); + const redactor = new sdk.CredentialRedactor(); + const responseScanner = new sdk.MCPResponseScanner(); + const securityScanner = new sdk.MCPSecurityScanner(); + const routeRateLimiter = new sdk.MCPSlidingRateLimiter({ maxRequests: 5, windowMs: 60_000 }); + const gatewayRateLimiter = new sdk.MCPSlidingRateLimiter({ maxRequests: 5, windowMs: 60_000 }); + const sessionAuthenticator = new sdk.MCPSessionAuthenticator({ + secret: loadSecret('MCP_SESSION_SECRET'), + ttlMs: 5 * 60_000, + }); + const messageSigner = new sdk.MCPMessageSigner({ + secret: loadSecret('MCP_SIGNING_SECRET'), + }); + const gateway = new sdk.MCPGateway({ + allowedTools: Object.keys(toolHandlers), + sensitiveTools: ['read_file'], + blockedPatterns: ['../', '..\\\\', '', 'ignore previous instructions'], + rateLimiter: gatewayRateLimiter, + auditSink, + approvalHandler: async ({ toolName, params }) => + toolName === 'read_file' && params.approved !== true + ? 
sdk.ApprovalStatus.Pending + : sdk.ApprovalStatus.Approved, + }); + const catalogScan = securityScanner.scanServer('mcp-express-server', toolDefinitions); + const app = express(); + app.use(express.json()); + + app.get('/health', async (_request, response) => { + response.json({ + status: 'ok', + catalogSafe: catalogScan.safe, + toolCount: toolDefinitions.length, + demoAgentId: 'demo-agent', + demoSessionToken: await issueDemoSession('demo-agent'), + }); + }); + + // Rate limiting is applied via routeRateLimiter.consume() inside the handler + // codeql[js/missing-rate-limiting] + app.post('/call-tool', async (request, response) => { + const agentId = readString(request.body, 'agentId') ?? 'demo-agent'; + const toolName = readString(request.body, 'toolName') ?? ''; + const args = asRecord(request.body?.args); + const sessionToken = request.header('x-session-token'); + const handler = toolHandlers[toolName]; + + if (!sessionToken) { + response.status(401).json({ error: 'Missing x-session-token. Call GET /health for a demo token.' 
}); + return; + } + if (!handler) { + response.status(404).json({ error: `Unknown tool '${toolName}'` }); + return; + } + + const session = await sessionAuthenticator.verifyToken(sessionToken, agentId); + if (!session.valid) { + response.status(401).json({ error: session.reason }); + return; + } + + const signedCall = messageSigner.sign({ agentId, toolName, args }); + const signature = await messageSigner.verify(signedCall); + if (!signature.valid) { + response.status(401).json({ error: signature.reason }); + return; + } + + const routeRateLimit = await routeRateLimiter.consume(`${agentId}:${toolName}`); + if (!routeRateLimit.allowed) { + response.status(429).json({ + error: 'Rate limit exceeded for this tool', + rateLimit: routeRateLimit, + }); + return; + } + + const requestThreats = securityScanner.scanTool( + toolName, + `${handler.description}\nRequest payload: ${JSON.stringify(args)}`, + handler.inputSchema, + 'mcp-express-server', + ); + if (requestThreats.some((threat) => threat.severity === 'critical')) { + response.status(400).json({ error: 'Security scanner rejected the request', threats: requestThreats }); + return; + } + + const decision = await gateway.evaluateToolCall(agentId, toolName, args); + if (!decision.allowed) { + response.status(403).json({ + allowed: false, + reason: decision.reason, + findings: decision.findings, + auditParams: decision.auditParams, + }); + return; + } + + const rawResult = await handler.run(args); + const scannedResult = responseScanner.scan(rawResult); + const safeResult = { + safe: scannedResult.safe, + blocked: scannedResult.blocked, + findings: scannedResult.findings, + sanitized: scannedResult.sanitized, + }; + const logEntry = redactor.redact({ agentId, toolName, args, result: safeResult.sanitized }).redacted; + + if (process.env.NODE_ENV !== 'test') { + console.info('[mcp-express-server]', JSON.stringify(logEntry)); + } + + response.status(scannedResult.blocked ? 
422 : 200).json({ + allowed: true, + reason: decision.reason, + messageVerification: signature, + response: safeResult, + auditEntries: auditSink.getEntries().length, + }); + }); + + async function issueDemoSession(agentId: string = 'demo-agent'): Promise { + const issued = await sessionAuthenticator.issueToken(agentId); + return issued.token; + } + + return { app, issueDemoSession }; +} + +function asRecord(value: unknown): Record { + return typeof value === 'object' && value !== null && !Array.isArray(value) + ? value as Record + : {}; +} + +function readString(value: unknown, key: string): string | undefined { + const record = asRecord(value); + return typeof record[key] === 'string' ? record[key] as string : undefined; +} + +function loadSecret(envName: string): string { + const secret = process.env[envName]; + if (secret && Buffer.byteLength(secret, 'utf-8') >= 32) { + return secret; + } + return randomBytes(32).toString('hex'); +} + +if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) { + const port = Number(process.env.PORT ?? 3000); + const { app } = createExampleServer(); + app.listen(port, () => { + console.log(`MCP Express example listening on http://127.0.0.1:${port}`); + }); +} diff --git a/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/test/server.test.ts b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/test/server.test.ts new file mode 100644 index 000000000..935557aab --- /dev/null +++ b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/test/server.test.ts @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +import assert from 'node:assert/strict'; +import test from 'node:test'; +import { AddressInfo } from 'node:net'; +import { createExampleServer } from '../src/server.ts'; + +process.env.NODE_ENV = 'test'; + +test('health endpoint returns a demo session token', async () => { + const { app } = createExampleServer(); + const server = app.listen(0); + + try { + const { port } = server.address() as AddressInfo; + const response = await fetch(`http://127.0.0.1:${port}/health`); + const payload = await response.json() as { status: string; demoSessionToken: string }; + + assert.equal(response.status, 200); + assert.equal(payload.status, 'ok'); + assert.ok(payload.demoSessionToken.length > 20); + } finally { + await new Promise((resolve) => server.close(() => resolve())); + } +}); + +test('call-tool runs the governance pipeline', async () => { + const { app, issueDemoSession } = createExampleServer(); + const server = app.listen(0); + + try { + const { port } = server.address() as AddressInfo; + const token = await issueDemoSession('demo-agent'); + const response = await fetch(`http://127.0.0.1:${port}/call-tool`, { + method: 'POST', + headers: { + 'content-type': 'application/json', + 'x-session-token': token, + }, + body: JSON.stringify({ + agentId: 'demo-agent', + toolName: 'search_docs', + args: { query: 'OWASP MCP' }, + }), + }); + const payload = await response.json() as { + allowed: boolean; + messageVerification: { valid: boolean }; + response: { safe: boolean }; + }; + + assert.equal(response.status, 200); + assert.equal(payload.allowed, true); + assert.equal(payload.messageVerification.valid, true); + assert.equal(payload.response.safe, true); + } finally { + await new Promise((resolve) => server.close(() => resolve())); + } +}); diff --git a/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/tsconfig.json b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/tsconfig.json new file mode 100644 index 000000000..5a268d4a3 --- /dev/null +++ 
b/packages/agent-mesh/sdks/typescript/examples/mcp-express-server/tsconfig.json @@ -0,0 +1,13 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "allowImportingTsExtensions": true, + "types": ["node"] + }, + "include": ["src/**/*.ts", "test/**/*.ts"] +} diff --git a/scripts/check_dependency_confusion.py b/scripts/check_dependency_confusion.py index 7b213f5e5..13532e1f4 100644 --- a/scripts/check_dependency_confusion.py +++ b/scripts/check_dependency_confusion.py @@ -1,431 +1,432 @@ -#!/usr/bin/env python3 -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -"""Pre-commit hook: detect unregistered PyPI package names in pip install commands. - -Scans staged files for `pip install ` where is not a known -registered package. Prevents dependency confusion attacks. - -Usage: - # Install as pre-commit hook - cp scripts/check_dependency_confusion.py .git/hooks/pre-commit - chmod +x .git/hooks/pre-commit - - # Or run manually - python scripts/check_dependency_confusion.py [files...] 
-""" - -import argparse -import glob -import json -import re -import subprocess -import sys - -# Known registered PyPI package names for this project -REGISTERED_PACKAGES = { - # Core packages (on PyPI) - "agent-os-kernel", - "agentmesh-platform", - "agent-hypervisor", - "agentmesh-runtime", - "agent-sre", - "agent-governance-toolkit", - "agentmesh-lightning", - "agentmesh-marketplace", - # Common dependencies - "pydantic", "pyyaml", "cryptography", "pynacl", "httpx", "aiohttp", - "fastapi", "uvicorn", "structlog", "click", "rich", "numpy", "scipy", - "pytest", "pytest-asyncio", "pytest-cov", "ruff", "mypy", "build", - "openai", "anthropic", "langchain", "langchain-core", "crewai", - "redis", "sqlalchemy", "asyncpg", "chromadb", "pinecone-client", - "sentence-transformers", "prometheus-client", "opentelemetry-api", - "opentelemetry-sdk", "fhir.resources", "hl7apy", "zenpy", "freshdesk", - "google-adk", "safety", "jupyter", "vitest", "tsup", "typescript", - # Dashboard / visualization (used in examples) - "streamlit", "plotly", "pandas", "networkx", "matplotlib", "pyvis", - # Async / caching (used in examples) - "aioredis", "aiofiles", "aiosqlite", - # Document processing / NLP (used in examples) - "pypdf", "python-docx", "pdfplumber", "beautifulsoup4", "lxml", - "spacy", "nltk", "tiktoken", "scikit-learn", - # Dev tools - "black", "flake8", "types-PyYAML", - # Infrastructure / runtime (used in examples) - "docker", "huggingface-hub", "python-dotenv", "python-dateutil", - "python-multipart", "python-json-logger", "langchain-openai", - # Slack / messaging - "slack-sdk", "slack-bolt", - # Telemetry - "opentelemetry-instrumentation-fastapi", "opentelemetry-exporter-otlp", - "opentelemetry-instrumentation-httpx", "opentelemetry-instrumentation-asyncio", - # pyproject.toml optional-dependency group names (not real packages) - "dev", "cli", "all", "server", "storage", "observability", - "django", "websocket", "websockets", "grpc", "grpcio", "grpcio-tools", - "agent-os", 
"test", "docs", "full", "api", "otel", "protocols", - "runtime", "sandbox", "sre", "hypervisor", "iatp", "keywords", - "llm", "mcp", "hf", "huggingface", "blockchain", "web3", - "multi-agent", "broker-agnostic", "pubsub", "kafka", "rabbitmq", - "sql", "async", "nexus", "caas-core", "message-bus", - "ai-agents", "amb", "eval_type_backport", - # Integration packages / real PyPI packages used as deps - "hypothesis", "fakeredis", "langflow", "langgraph", - "agentmesh", "pydantic-ai", "haystack", "haystack-ai", "respx", - "langfuse", "arize", "arize-phoenix", "llamaindex", "braintrust", "helicone", - "datadog", "langsmith", "wandb", "mlflow", "agentops", - "typer", "jsonschema", "anyio", "pre-commit", "import-linter", - "mkdocs", "mkdocs-material", "mkdocstrings", "datasets", "sqlglot", - "aio-pika", "aiokafka", - # Cedar/OPA policy backends - "cedarpy", "llama-index-core", "ddtrace", - # Internal module references - "inter-agent-trust-protocol", "agent-control-plane", "cmvk", - "agent-tool-registry", "cedar", "opa", "huggingface_hub", - # APS adapter optional deps - "aps", "agent-passport-system", - # Internal cross-package references (local-only, NOT on PyPI) - # These are flagged as HIGH RISK if found in requirements.txt with version pins - # instead of path references. See dependency confusion attack vector. 
- "agent-primitives", "emk", - # With extras (base name is what matters) -} - -# Local-only packages that should NEVER appear with version pins in -# requirements.txt (they must use path references like -e ../primitives) -LOCAL_ONLY_PACKAGES = {"agent-primitives", "emk"} - -# Known npm packages for this project -REGISTERED_NPM_PACKAGES = { - "@microsoft/agent-os-kernel", "@microsoft/agentmesh-mcp-proxy", - "@microsoft/agentmesh-api", "@microsoft/agent-os-cursor", - "@microsoft/agentmesh-mastra", "@microsoft/agentmesh-copilot-governance", - "@microsoft/agent-os-copilot-extension", "@microsoft/agentos-mcp-server", - "@microsoft/agent-os-vscode", - # Common deps - "typescript", "tsup", "vitest", "express", "zod", "@mastra/core", - "@modelcontextprotocol/sdk", "ws", "commander", "chalk", - "@anthropic-ai/sdk", "@types/node", "@types/ws", "@types/express", - # Common npm dev dependencies - "eslint", "@typescript-eslint/parser", "@typescript-eslint/eslint-plugin", - "ts-jest", "@types/jest", "jest", "rimraf", "prettier", - "axios", "@types/vscode", "@vscode/vsce", "webpack", "webpack-cli", - "ts-node", "nodemon", "concurrently", "dotenv", - "esbuild", "@esbuild/linux-x64", "@esbuild/darwin-arm64", - # npm deps from extensions/copilot - "@octokit/webhooks", "path-to-regexp", "winston", - # npm deps from extensions/chrome - "react", "react-dom", "webextension-polyfill", - "@types/chrome", "@types/react", "@types/react-dom", - "copy-webpack-plugin", "css-loader", "eslint-plugin-react", - "eslint-plugin-react-hooks", "html-webpack-plugin", "style-loader", - "ts-loader", - # npm deps from extensions/mcp-server - "uuid", "yaml", "zod", "@types/uuid", "@vitest/coverage-v8", - # npm deps from mcp-proxy - "crypto-js", - # npm deps from sdks/typescript - "js-yaml", "@noble/ed25519", - # npm deps from agent-os-vscode - "@types/glob", "@types/mocha", "@vscode/test-electron", - "autoprefixer", "glob", "mocha", "postcss", "tailwindcss", -} - -# Known Cargo crate names 
-REGISTERED_CARGO_PACKAGES = { - "serde", "serde_json", "serde_yaml", "sha2", "ed25519-dalek", - "rand", "thiserror", "tempfile", "agentmesh", -} - -# Patterns that are always safe (not package names) -SAFE_PATTERNS = { - "-e", "--editable", "-r", "--requirement", "--upgrade", "--no-cache-dir", - "--quiet", "--require-hashes", "--hash", ".", "..", "../..", - "pip", "install", "%pip", -} - -PIP_INSTALL_RE = re.compile( - r'(?:%?pip)\s+install\s+(.+?)(?:\s*\\?\s*$|(?=\s*&&|\s*\||\s*;|\s*#))', - re.MULTILINE, -) - - -def extract_package_names(install_args: str) -> list[str]: - """Extract package names from a pip install argument string.""" - packages = [] - for token in install_args.split(): - # Skip flags - if token.startswith("-") or token in SAFE_PATTERNS: - continue - if token.startswith((".", "/", "\\", "http", "git+")): - continue - # Skip tokens that look like code, not package names - if any(c in token for c in ('(', ')', '=', '"', "'", ":")): - continue - # Strip extras: package[extra] -> package - base = re.sub(r'\[.*\]', '', token) - # Strip version specifiers: package>=1.0 -> package - base = re.split(r'[><=!~]', base)[0] - # Strip markdown/quote artifacts - base = base.strip('`"\'(){}%') - if base and base not in SAFE_PATTERNS: - packages.append(base) - return packages - - -def check_file(filepath: str) -> list[str]: - """Check a file for potentially unregistered pip install targets.""" - findings = [] - try: - with open(filepath, encoding="utf-8", errors="ignore") as f: - content = f.read() - except (OSError, UnicodeDecodeError): - return findings - - for match in PIP_INSTALL_RE.finditer(content): - line_num = content[:match.start()].count("\n") + 1 - packages = extract_package_names(match.group(1)) - for pkg in packages: - if pkg.lower() not in {p.lower() for p in REGISTERED_PACKAGES}: - findings.append( - f" {filepath}:{line_num}: " - f"'{pkg}' may not be registered on PyPI" - ) - return findings - - -def check_requirements_file(filepath: str) -> 
list[str]: - """Check a requirements*.txt file for unregistered package names.""" - findings = [] - try: - with open(filepath, encoding="utf-8", errors="ignore") as f: - lines = f.readlines() - except (OSError, UnicodeDecodeError): - return findings - - registered_lower = {p.lower() for p in REGISTERED_PACKAGES} - for line_num, line in enumerate(lines, 1): - line = line.strip() - if not line or line.startswith("#") or line.startswith("-"): - continue - if line.startswith((".", "/", "\\", "http", "git+")): - continue - # Strip extras and version specifiers - base = re.sub(r'\[.*\]', '', line) - base = re.split(r'[><=!~;@\s]', base)[0].strip() - if base and base.lower() not in registered_lower: - findings.append( - f" {filepath}:{line_num}: " - f"'{base}' may not be registered on PyPI" - ) - return findings - - -def check_notebook(filepath: str) -> list[str]: - """Check a Jupyter notebook for pip install of unregistered packages.""" - findings = [] - try: - with open(filepath, encoding="utf-8", errors="ignore") as f: - nb = json.load(f) - except (OSError, json.JSONDecodeError, UnicodeDecodeError): - return findings - - registered_lower = {p.lower() for p in REGISTERED_PACKAGES} - for cell in nb.get("cells", []): - for line in cell.get("source", []): - if "pip install" in line and not line.strip().startswith("#"): - packages = extract_package_names(line) - for pkg in packages: - if pkg.lower() not in registered_lower: - findings.append( - f" {filepath}: " - f"'{pkg}' may not be registered on PyPI" - ) - return findings - - -def check_pyproject_toml(filepath: str) -> list[str]: - """Check a pyproject.toml for unregistered package dependencies.""" - findings = [] - try: - with open(filepath, encoding="utf-8", errors="ignore") as f: - content = f.read() - except (OSError, UnicodeDecodeError): - return findings - - registered_lower = {p.lower() for p in REGISTERED_PACKAGES} - # Match dependency lines like: "package>=1.0" or "package[extra]>=1.0,<2.0" - dep_re = 
re.compile(r'^[\s"]*([a-zA-Z0-9_-]+)', re.MULTILINE) - in_deps = False - in_optional = False - for line_num, line in enumerate(content.splitlines(), 1): - stripped = line.strip() - if stripped.startswith("[project.dependencies]"): - in_deps = True - in_optional = False - continue - if stripped.startswith("[project.optional-dependencies"): - in_deps = True - in_optional = True - continue - if stripped.startswith("[") and in_deps: - in_deps = False - in_optional = False - continue - if not in_deps: - continue - if not stripped or stripped.startswith("#"): - continue - # In optional-dependencies, lines like 'aps = ["pkg>=1.0"]' are group - # headers — the key (aps) is an extras name, not a package. Parse the - # values inside the brackets instead. - if in_optional and re.match(r'^[a-zA-Z0-9_-]+\s*=\s*\[', stripped): - # Extract package names from the bracket contents - bracket_content = stripped.split("[", 1)[1].rstrip("]").strip() - for item in bracket_content.split(","): - item = item.strip().strip('"').strip("'") - if item: - base = re.split(r'[><=!~;@\s]', item)[0].strip() - if base and base.lower() not in registered_lower: - findings.append( - f" {filepath}:{line_num}: '{base}' may not be registered on PyPI" - ) - continue - m = dep_re.match(stripped.strip('"').strip("'").strip(",")) - if m: - pkg = m.group(1) - if pkg.lower() not in registered_lower and pkg.lower() not in { - "python", "requires-python", - }: - severity = "HIGH RISK" if pkg.lower() in { - p.lower() for p in LOCAL_ONLY_PACKAGES - } else "" - msg = f" {filepath}:{line_num}: '{pkg}' may not be registered on PyPI" - if severity: - msg += f" [{severity}: local-only package]" - findings.append(msg) - return findings - - -def check_package_json(filepath: str) -> list[str]: - """Check a package.json for unregistered npm package dependencies.""" - findings = [] - try: - with open(filepath, encoding="utf-8", errors="ignore") as f: - data = json.load(f) - except (OSError, json.JSONDecodeError, 
UnicodeDecodeError): - return findings - - registered_lower = {p.lower() for p in REGISTERED_NPM_PACKAGES} - for section in ("dependencies", "devDependencies", "peerDependencies"): - for pkg in data.get(section, {}): - if pkg.lower() not in registered_lower: - findings.append( - f" {filepath}: npm '{pkg}' ({section}) may not be registered" - ) - return findings - - -def check_cargo_toml(filepath: str) -> list[str]: - """Check a Cargo.toml for unregistered crate dependencies.""" - findings = [] - try: - with open(filepath, encoding="utf-8", errors="ignore") as f: - content = f.read() - except (OSError, UnicodeDecodeError): - return findings - - registered_lower = {p.lower() for p in REGISTERED_CARGO_PACKAGES} - in_deps = False - for line_num, line in enumerate(content.splitlines(), 1): - stripped = line.strip() - if stripped in ("[dependencies]", "[dev-dependencies]", - "[build-dependencies]"): - in_deps = True - continue - if stripped.startswith("[") and in_deps: - in_deps = False - continue - if not in_deps or not stripped or stripped.startswith("#"): - continue - m = re.match(r'^([a-zA-Z0-9_-]+)\s*=', stripped) - if m: - crate = m.group(1) - if crate.lower() not in registered_lower: - findings.append( - f" {filepath}:{line_num}: crate '{crate}' " - f"may not be registered on crates.io" - ) - return findings - - -def main() -> int: - parser = argparse.ArgumentParser( - description="Detect unregistered PyPI package names in pip install commands.", - ) - parser.add_argument( - "--strict", action="store_true", - help="Also scan notebooks and requirements*.txt files; exit 1 on any violation", - ) - parser.add_argument("files", nargs="*", help="Files to check") - args = parser.parse_args() - - # Get files to check - if args.files: - files = args.files - else: - # Pre-commit mode: check staged files - result = subprocess.run( - ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"], - capture_output=True, text=True, - ) - files = [ - f for f in 
result.stdout.strip().split("\n") - if f.endswith((".md", ".py", ".ts", ".txt", ".yaml", ".yml", ".ipynb", ".svg")) - ] - - all_findings = [] - for f in files: - all_findings.extend(check_file(f)) - - # --strict: additionally scan all notebooks, requirements, and manifest files - if args.strict: - for nb in glob.glob("**/*.ipynb", recursive=True): - if "node_modules" in nb or ".ipynb_checkpoints" in nb: - continue - all_findings.extend(check_notebook(nb)) - - for req in glob.glob("**/requirements*.txt", recursive=True): - if "node_modules" in req: - continue - all_findings.extend(check_requirements_file(req)) - - for pyproj in glob.glob("**/pyproject.toml", recursive=True): - if "node_modules" in pyproj: - continue - all_findings.extend(check_pyproject_toml(pyproj)) - - for pkgjson in glob.glob("**/package.json", recursive=True): - if "node_modules" in pkgjson: - continue - all_findings.extend(check_package_json(pkgjson)) - - for cargo in glob.glob("**/Cargo.toml", recursive=True): - if "node_modules" in cargo: - continue - all_findings.extend(check_cargo_toml(cargo)) - - if all_findings: - print("⚠️ Potential dependency confusion detected:") - print() - for finding in all_findings: - print(finding) - print() - print("If the package IS registered on PyPI, add it to REGISTERED_PACKAGES") - print("in scripts/check_dependency_confusion.py") - return 1 - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Pre-commit hook: detect unregistered PyPI package names in pip install commands. + +Scans staged files for `pip install ` where is not a known +registered package. Prevents dependency confusion attacks. + +Usage: + # Install as pre-commit hook + cp scripts/check_dependency_confusion.py .git/hooks/pre-commit + chmod +x .git/hooks/pre-commit + + # Or run manually + python scripts/check_dependency_confusion.py [files...] 
+""" + +import argparse +import glob +import json +import re +import subprocess +import sys + +# Known registered PyPI package names for this project +REGISTERED_PACKAGES = { + # Core packages (on PyPI) + "agent-os-kernel", + "agentmesh-platform", + "agent-hypervisor", + "agentmesh-runtime", + "agent-sre", + "agent-governance-toolkit", + "agentmesh-lightning", + "agentmesh-marketplace", + # Common dependencies + "pydantic", "pyyaml", "cryptography", "pynacl", "httpx", "aiohttp", + "fastapi", "uvicorn", "structlog", "click", "rich", "numpy", "scipy", + "pytest", "pytest-asyncio", "pytest-cov", "ruff", "mypy", "build", + "openai", "anthropic", "langchain", "langchain-core", "crewai", + "redis", "sqlalchemy", "asyncpg", "chromadb", "pinecone-client", + "sentence-transformers", "prometheus-client", "opentelemetry-api", + "opentelemetry-sdk", "fhir.resources", "hl7apy", "zenpy", "freshdesk", + "google-adk", "safety", "jupyter", "vitest", "tsup", "typescript", + # Dashboard / visualization (used in examples) + "streamlit", "plotly", "pandas", "networkx", "matplotlib", "pyvis", + # Async / caching (used in examples) + "aioredis", "aiofiles", "aiosqlite", + # Document processing / NLP (used in examples) + "pypdf", "python-docx", "pdfplumber", "beautifulsoup4", "lxml", + "spacy", "nltk", "tiktoken", "scikit-learn", + # Dev tools + "black", "flake8", "types-PyYAML", + # Infrastructure / runtime (used in examples) + "docker", "huggingface-hub", "python-dotenv", "python-dateutil", + "python-multipart", "python-json-logger", "langchain-openai", + # Slack / messaging + "slack-sdk", "slack-bolt", + # Telemetry + "opentelemetry-instrumentation-fastapi", "opentelemetry-exporter-otlp", + "opentelemetry-instrumentation-httpx", "opentelemetry-instrumentation-asyncio", + # pyproject.toml optional-dependency group names (not real packages) + "dev", "cli", "all", "server", "storage", "observability", + "django", "websocket", "websockets", "grpc", "grpcio", "grpcio-tools", + "agent-os", 
"test", "docs", "full", "api", "otel", "protocols", + "runtime", "sandbox", "sre", "hypervisor", "iatp", "keywords", + "llm", "mcp", "hf", "huggingface", "blockchain", "web3", + "multi-agent", "broker-agnostic", "pubsub", "kafka", "rabbitmq", + "sql", "async", "nexus", "caas-core", "message-bus", + "ai-agents", "amb", "eval_type_backport", + # Integration packages / real PyPI packages used as deps + "hypothesis", "fakeredis", "langflow", "langgraph", + "agentmesh", "pydantic-ai", "haystack", "haystack-ai", "respx", + "langfuse", "arize", "arize-phoenix", "llamaindex", "braintrust", "helicone", + "datadog", "langsmith", "wandb", "mlflow", "agentops", + "typer", "jsonschema", "anyio", "pre-commit", "import-linter", + "mkdocs", "mkdocs-material", "mkdocstrings", "datasets", "sqlglot", + "aio-pika", "aiokafka", + # Cedar/OPA policy backends + "cedarpy", "llama-index-core", "ddtrace", + # Internal module references + "inter-agent-trust-protocol", "agent-control-plane", "cmvk", + "agent-tool-registry", "cedar", "opa", "huggingface_hub", + # APS adapter optional deps + "aps", "agent-passport-system", + # Internal cross-package references (local-only, NOT on PyPI) + # These are flagged as HIGH RISK if found in requirements.txt with version pins + # instead of path references. See dependency confusion attack vector. 
+ "agent-primitives", "emk", + # With extras (base name is what matters) +} + +# Local-only packages that should NEVER appear with version pins in +# requirements.txt (they must use path references like -e ../primitives) +LOCAL_ONLY_PACKAGES = {"agent-primitives", "emk"} + +# Known npm packages for this project +REGISTERED_NPM_PACKAGES = { + "@microsoft/agent-os-kernel", "@microsoft/agentmesh-mcp-proxy", + "@microsoft/agentmesh-api", "@microsoft/agent-os-cursor", + "@microsoft/agentmesh-mastra", "@microsoft/agentmesh-copilot-governance", + "@microsoft/agent-os-copilot-extension", "@microsoft/agentos-mcp-server", + "@microsoft/agent-os-vscode", "@microsoft/agentmesh-sdk", + "@microsoft/agentmesh-mcp-governance", + # Common deps + "typescript", "tsup", "vitest", "express", "zod", "@mastra/core", + "@modelcontextprotocol/sdk", "ws", "commander", "chalk", + "@anthropic-ai/sdk", "@types/node", "@types/ws", "@types/express", + # Common npm dev dependencies + "eslint", "@typescript-eslint/parser", "@typescript-eslint/eslint-plugin", + "ts-jest", "@types/jest", "jest", "rimraf", "prettier", "tsx", + "axios", "@types/vscode", "@vscode/vsce", "webpack", "webpack-cli", + "ts-node", "nodemon", "concurrently", "dotenv", + "esbuild", "@esbuild/linux-x64", "@esbuild/darwin-arm64", + # npm deps from extensions/copilot + "@octokit/webhooks", "path-to-regexp", "winston", + # npm deps from extensions/chrome + "react", "react-dom", "webextension-polyfill", + "@types/chrome", "@types/react", "@types/react-dom", + "copy-webpack-plugin", "css-loader", "eslint-plugin-react", + "eslint-plugin-react-hooks", "html-webpack-plugin", "style-loader", + "ts-loader", + # npm deps from extensions/mcp-server + "uuid", "yaml", "zod", "@types/uuid", "@vitest/coverage-v8", + # npm deps from mcp-proxy + "crypto-js", + # npm deps from sdks/typescript + "js-yaml", "@noble/ed25519", + # npm deps from agent-os-vscode + "@types/glob", "@types/mocha", "@vscode/test-electron", + "autoprefixer", "glob", 
"mocha", "postcss", "tailwindcss", +} + +# Known Cargo crate names +REGISTERED_CARGO_PACKAGES = { + "serde", "serde_json", "serde_yaml", "sha2", "ed25519-dalek", + "rand", "thiserror", "tempfile", "agentmesh", +} + +# Patterns that are always safe (not package names) +SAFE_PATTERNS = { + "-e", "--editable", "-r", "--requirement", "--upgrade", "--no-cache-dir", + "--quiet", "--require-hashes", "--hash", ".", "..", "../..", + "pip", "install", "%pip", +} + +PIP_INSTALL_RE = re.compile( + r'(?:%?pip)\s+install\s+(.+?)(?:\s*\\?\s*$|(?=\s*&&|\s*\||\s*;|\s*#))', + re.MULTILINE, +) + + +def extract_package_names(install_args: str) -> list[str]: + """Extract package names from a pip install argument string.""" + packages = [] + for token in install_args.split(): + # Skip flags + if token.startswith("-") or token in SAFE_PATTERNS: + continue + if token.startswith((".", "/", "\\", "http", "git+")): + continue + # Skip tokens that look like code, not package names + if any(c in token for c in ('(', ')', '=', '"', "'", ":")): + continue + # Strip extras: package[extra] -> package + base = re.sub(r'\[.*\]', '', token) + # Strip version specifiers: package>=1.0 -> package + base = re.split(r'[><=!~]', base)[0] + # Strip markdown/quote artifacts + base = base.strip('`"\'(){}%') + if base and base not in SAFE_PATTERNS: + packages.append(base) + return packages + + +def check_file(filepath: str) -> list[str]: + """Check a file for potentially unregistered pip install targets.""" + findings = [] + try: + with open(filepath, encoding="utf-8", errors="ignore") as f: + content = f.read() + except (OSError, UnicodeDecodeError): + return findings + + for match in PIP_INSTALL_RE.finditer(content): + line_num = content[:match.start()].count("\n") + 1 + packages = extract_package_names(match.group(1)) + for pkg in packages: + if pkg.lower() not in {p.lower() for p in REGISTERED_PACKAGES}: + findings.append( + f" {filepath}:{line_num}: " + f"'{pkg}' may not be registered on PyPI" + ) + 
return findings + + +def check_requirements_file(filepath: str) -> list[str]: + """Check a requirements*.txt file for unregistered package names.""" + findings = [] + try: + with open(filepath, encoding="utf-8", errors="ignore") as f: + lines = f.readlines() + except (OSError, UnicodeDecodeError): + return findings + + registered_lower = {p.lower() for p in REGISTERED_PACKAGES} + for line_num, line in enumerate(lines, 1): + line = line.strip() + if not line or line.startswith("#") or line.startswith("-"): + continue + if line.startswith((".", "/", "\\", "http", "git+")): + continue + # Strip extras and version specifiers + base = re.sub(r'\[.*\]', '', line) + base = re.split(r'[><=!~;@\s]', base)[0].strip() + if base and base.lower() not in registered_lower: + findings.append( + f" {filepath}:{line_num}: " + f"'{base}' may not be registered on PyPI" + ) + return findings + + +def check_notebook(filepath: str) -> list[str]: + """Check a Jupyter notebook for pip install of unregistered packages.""" + findings = [] + try: + with open(filepath, encoding="utf-8", errors="ignore") as f: + nb = json.load(f) + except (OSError, json.JSONDecodeError, UnicodeDecodeError): + return findings + + registered_lower = {p.lower() for p in REGISTERED_PACKAGES} + for cell in nb.get("cells", []): + for line in cell.get("source", []): + if "pip install" in line and not line.strip().startswith("#"): + packages = extract_package_names(line) + for pkg in packages: + if pkg.lower() not in registered_lower: + findings.append( + f" {filepath}: " + f"'{pkg}' may not be registered on PyPI" + ) + return findings + + +def check_pyproject_toml(filepath: str) -> list[str]: + """Check a pyproject.toml for unregistered package dependencies.""" + findings = [] + try: + with open(filepath, encoding="utf-8", errors="ignore") as f: + content = f.read() + except (OSError, UnicodeDecodeError): + return findings + + registered_lower = {p.lower() for p in REGISTERED_PACKAGES} + # Match dependency lines like: 
"package>=1.0" or "package[extra]>=1.0,<2.0" + dep_re = re.compile(r'^[\s"]*([a-zA-Z0-9_-]+)', re.MULTILINE) + in_deps = False + in_optional = False + for line_num, line in enumerate(content.splitlines(), 1): + stripped = line.strip() + if stripped.startswith("[project.dependencies]"): + in_deps = True + in_optional = False + continue + if stripped.startswith("[project.optional-dependencies"): + in_deps = True + in_optional = True + continue + if stripped.startswith("[") and in_deps: + in_deps = False + in_optional = False + continue + if not in_deps: + continue + if not stripped or stripped.startswith("#"): + continue + # In optional-dependencies, lines like 'aps = ["pkg>=1.0"]' are group + # headers — the key (aps) is an extras name, not a package. Parse the + # values inside the brackets instead. + if in_optional and re.match(r'^[a-zA-Z0-9_-]+\s*=\s*\[', stripped): + # Extract package names from the bracket contents + bracket_content = stripped.split("[", 1)[1].rstrip("]").strip() + for item in bracket_content.split(","): + item = item.strip().strip('"').strip("'") + if item: + base = re.split(r'[><=!~;@\s]', item)[0].strip() + if base and base.lower() not in registered_lower: + findings.append( + f" {filepath}:{line_num}: '{base}' may not be registered on PyPI" + ) + continue + m = dep_re.match(stripped.strip('"').strip("'").strip(",")) + if m: + pkg = m.group(1) + if pkg.lower() not in registered_lower and pkg.lower() not in { + "python", "requires-python", + }: + severity = "HIGH RISK" if pkg.lower() in { + p.lower() for p in LOCAL_ONLY_PACKAGES + } else "" + msg = f" {filepath}:{line_num}: '{pkg}' may not be registered on PyPI" + if severity: + msg += f" [{severity}: local-only package]" + findings.append(msg) + return findings + + +def check_package_json(filepath: str) -> list[str]: + """Check a package.json for unregistered npm package dependencies.""" + findings = [] + try: + with open(filepath, encoding="utf-8", errors="ignore") as f: + data = 
json.load(f) + except (OSError, json.JSONDecodeError, UnicodeDecodeError): + return findings + + registered_lower = {p.lower() for p in REGISTERED_NPM_PACKAGES} + for section in ("dependencies", "devDependencies", "peerDependencies"): + for pkg in data.get(section, {}): + if pkg.lower() not in registered_lower: + findings.append( + f" {filepath}: npm '{pkg}' ({section}) may not be registered" + ) + return findings + + +def check_cargo_toml(filepath: str) -> list[str]: + """Check a Cargo.toml for unregistered crate dependencies.""" + findings = [] + try: + with open(filepath, encoding="utf-8", errors="ignore") as f: + content = f.read() + except (OSError, UnicodeDecodeError): + return findings + + registered_lower = {p.lower() for p in REGISTERED_CARGO_PACKAGES} + in_deps = False + for line_num, line in enumerate(content.splitlines(), 1): + stripped = line.strip() + if stripped in ("[dependencies]", "[dev-dependencies]", + "[build-dependencies]"): + in_deps = True + continue + if stripped.startswith("[") and in_deps: + in_deps = False + continue + if not in_deps or not stripped or stripped.startswith("#"): + continue + m = re.match(r'^([a-zA-Z0-9_-]+)\s*=', stripped) + if m: + crate = m.group(1) + if crate.lower() not in registered_lower: + findings.append( + f" {filepath}:{line_num}: crate '{crate}' " + f"may not be registered on crates.io" + ) + return findings + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Detect unregistered PyPI package names in pip install commands.", + ) + parser.add_argument( + "--strict", action="store_true", + help="Also scan notebooks and requirements*.txt files; exit 1 on any violation", + ) + parser.add_argument("files", nargs="*", help="Files to check") + args = parser.parse_args() + + # Get files to check + if args.files: + files = args.files + else: + # Pre-commit mode: check staged files + result = subprocess.run( + ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"], + capture_output=True, 
text=True, + ) + files = [ + f for f in result.stdout.strip().split("\n") + if f.endswith((".md", ".py", ".ts", ".txt", ".yaml", ".yml", ".ipynb", ".svg")) + ] + + all_findings = [] + for f in files: + all_findings.extend(check_file(f)) + + # --strict: additionally scan all notebooks, requirements, and manifest files + if args.strict: + for nb in glob.glob("**/*.ipynb", recursive=True): + if "node_modules" in nb or ".ipynb_checkpoints" in nb: + continue + all_findings.extend(check_notebook(nb)) + + for req in glob.glob("**/requirements*.txt", recursive=True): + if "node_modules" in req: + continue + all_findings.extend(check_requirements_file(req)) + + for pyproj in glob.glob("**/pyproject.toml", recursive=True): + if "node_modules" in pyproj: + continue + all_findings.extend(check_pyproject_toml(pyproj)) + + for pkgjson in glob.glob("**/package.json", recursive=True): + if "node_modules" in pkgjson: + continue + all_findings.extend(check_package_json(pkgjson)) + + for cargo in glob.glob("**/Cargo.toml", recursive=True): + if "node_modules" in cargo: + continue + all_findings.extend(check_cargo_toml(cargo)) + + if all_findings: + print("⚠️ Potential dependency confusion detected:") + print() + for finding in all_findings: + print(finding) + print() + print("If the package IS registered on PyPI, add it to REGISTERED_PACKAGES") + print("in scripts/check_dependency_confusion.py") + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main())