diff --git a/.agents/skills/handoff/SKILL.md b/.agents/skills/handoff/SKILL.md new file mode 100644 index 00000000..0100ece7 --- /dev/null +++ b/.agents/skills/handoff/SKILL.md @@ -0,0 +1,29 @@ +--- +name: handoff +description: Create a self-contained Jetmon handoff for another agent. +--- + +# Jetmon Handoff + +Use this when Chris asks for a handoff doc or wants another agent to continue a +Jetmon thread. + +## Include + +- Repo path, branch, and relevant commit IDs. +- Whether the work affects Jetmon v1, Jetmon v2, Veriflier, bridge, support + services, or uptime-bench. +- Active test locks and what must not be changed. +- Problem statement, evidence, and current hypothesis. +- Relevant logs, reports, metrics, PRs, and file paths. +- Commands already run and their outcome. +- Next recommended actions and approvals needed. + +## Placement + +During active tests, prefer `.agents` or global memory for agent-only handoffs. +Ask before editing non-agent project docs. + +## Secrets + +Do not include tokens, passwords, private keys, or unredacted service configs. diff --git a/.agents/skills/jetmon-test-fleet/SKILL.md b/.agents/skills/jetmon-test-fleet/SKILL.md new file mode 100644 index 00000000..d77e652e --- /dev/null +++ b/.agents/skills/jetmon-test-fleet/SKILL.md @@ -0,0 +1,44 @@ +--- +name: jetmon-test-fleet +description: Work safely with Jetmon services used by uptime-bench capacity tests. +--- + +# Jetmon Test Fleet + +Use this when Chris asks about Jetmon v1/v2 test services, Verifliers, support +services, Prometheus capacity data, or whether a Jetmon branch is ready for +uptime-bench tests. + +## Safety First + +- If tests are running, do not restart services, change config, move support + services, deploy binaries, mutate databases, or alter target/provider state + without explicit permission. +- Prefer read-only inspection and report analysis during active tests. +- State which repo is being acted on before making changes. 
+ +## Common Context + +- Uptime-bench canonical repo: + `/home/gaarai/code/uptime-bench`. +- Current Prometheus for Jetmon capacity work: + `http://10.0.0.67:9091`. +- Service hosts: + `jetmon-service-host-1`/`jetmon-v1`, + `jetmon-service-host-2`/`jetmon-v2`, + `jetmon-service-host-3`, + `jetmon-service-host-4`. +- Support/monitoring hosts: + `jetmon-vm-host-1`, + `jetmon-vm-host-2`, + `jetmon-vm-host-3`. + +## Output Expectations + +When answering readiness or risk questions, include: + +- Branch and commit under discussion. +- What is deployed versus only local. +- Which checks were read-only. +- Whether changes are safe during an active uptime-bench run. +- Recommended next action and any approval needed. diff --git a/.agents/skills/safe-background-work/SKILL.md b/.agents/skills/safe-background-work/SKILL.md new file mode 100644 index 00000000..087c70a7 --- /dev/null +++ b/.agents/skills/safe-background-work/SKILL.md @@ -0,0 +1,31 @@ +--- +name: safe-background-work +description: Pick useful Jetmon work that cannot affect active uptime-bench or Jetmon tests. +--- + +# Safe Background Work + +Use this when tests are running and Chris asks what can be done without +interrupting them. + +## Allowed By Default + +- Local code review and static analysis. +- Agent-specific files. +- Branch inspection and commit comparison. +- Handoff writing. +- Local-only planning for changes that will not be deployed. + +## Ask First + +- Deploying binaries or configs. +- Restarting `jetmon2`, Jetmon v1, bridge, Veriflier, database, StatsD, or + monitoring services. +- Moving support services between hosts. +- Changing bucket ownership, pinned bucket ranges, or test fleet data. +- Running smoke tests that create, delete, or modify sites/providers. + +## Blocker Policy + +If a safe task becomes blocked on approval, record the blocker and move to the +next safe task. 
diff --git a/.claude/commands/debug-memory.md b/.claude/commands/debug-memory.md index ddb502ad..0d6966cc 100644 --- a/.claude/commands/debug-memory.md +++ b/.claude/commands/debug-memory.md @@ -1,97 +1,66 @@ # Debug Memory Issues -Debug memory issues in Jetmon workers and identify leaks. +Debug memory growth and goroutine leaks in the Jetmon 2 Go binary. ## Instructions -Help the user diagnose memory problems in Jetmon workers. Memory leaks are a known pitfall because workers are long-running processes. +Help the user diagnose memory problems in Jetmon 2. Unlike the old Node.js/worker architecture, +Jetmon 2 is a single Go binary. Memory pressure does not cause worker crashes — instead the +orchestrator drains the goroutine pool when RSS exceeds `WORKER_MAX_MEM_MB`. ### 1. Check Current Memory Status -First, see current memory usage of all Jetmon processes: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon ps aux --sort=-%mem | grep -E '(node|PID)' | head -20 +cd docker && docker compose exec jetmon ps aux ``` -Check worker memory limits in config: +Check memory config: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon cat config/config.json | grep -E '(WORKER_MAX_MEM|WORKER_MAX_CHECK)' +docker compose exec jetmon cat config/config.json | grep -E '(WORKER_MAX_MEM|NUM_WORKERS)' ``` -### 2. Monitor Memory Over Time +### 2. Use pprof for Deep Analysis -Watch memory growth in real-time: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon bash -c 'while true; do echo "=== $(date) ==="; ps aux --sort=-%mem | grep node | head -10; sleep 10; done' -``` - -Let this run for a few minutes to observe trends. Look for: -- Workers steadily increasing memory without recycling -- Workers approaching or exceeding `WORKER_MAX_MEM_MB` (default 53MB) -- Memory not dropping after worker recycle - -### 3. 
Check Worker Recycling +The operator dashboard exposes pprof endpoints at http://localhost:8080/debug/pprof/ -Verify workers are being recycled when hitting limits: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose logs jetmon 2>&1 | grep -E '(memory|recycle|spawn|die|limit)' | tail -30 -``` - -### 4. Force Aggressive Recycling (Testing) +# Count goroutines +curl http://localhost:8080/debug/pprof/goroutine?debug=1 | grep -c "^goroutine" -To test worker recycling behavior, temporarily set low limits: - -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon bash -c 'cat > /tmp/test-config.json << EOF -{ - "WORKER_MAX_CHECKS": 50, - "WORKER_MAX_MEM_MB": 20 -} -EOF -cat /tmp/test-config.json' +# Heap profile +curl http://localhost:8080/debug/pprof/heap > heap.prof +go tool pprof heap.prof ``` -Tell the user to manually update `config/config.json` with these values, then reload: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon sh -c 'kill -HUP $(pgrep -f "node lib/jetmon.js" | head -1)' -``` +### 3. Monitor Memory Over Time -Watch for recycling: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose logs -f jetmon 2>&1 | grep -E '(spawn|die|recycle|memory|limit)' +docker compose exec jetmon bash -c 'while true; do ps -o pid,rss,vsz,comm -p $(pgrep jetmon2); sleep 10; done' ``` -### 5. Check for Known Memory Issues +Enable detailed StatsD metrics by setting `STATSD_SEND_MEM_USAGE: true` in `config/config.json`, +then reload config: `docker compose exec jetmon ./jetmon2 reload` -**Retry queue growth:** If retry queues aren't being processed, they can grow unbounded: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose logs jetmon 2>&1 | grep -i retry | tail -20 -``` +### 4. Check Retry Queue Size + +Large retry queues indicate many sites are down and being tracked. This is expected behaviour. 
-**StatsD buffer:** Check if metrics buffer is growing: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon bash -c 'cat stats/* 2>/dev/null' +curl http://localhost:8080/api/state | python3 -m json.tool ``` -### 6. Analyze with Node.js Tools (Advanced) - -If deeper analysis is needed, suggest: - -1. **Heap snapshots:** Would require code changes to expose `v8.writeHeapSnapshot()` -2. **--inspect flag:** Could attach Chrome DevTools, but requires exposing debug port -3. **Process stats:** Check `/proc//status` for detailed memory breakdown +Look at `RetryQueueSize`. -### 7. Common Memory Issues in Jetmon +### 5. Common Issues | Symptom | Likely Cause | Fix | |---------|--------------|-----| -| Workers never recycle | `WORKER_MAX_MEM_MB` set to 0 or very high | Set reasonable limit (53MB default) | -| Memory spikes during rounds | Too many concurrent checks | Reduce `NUM_TO_PROCESS` | -| Gradual leak over hours | Retry queue not draining | Check Veriflier connectivity | -| Sudden OOM | Node.js version regression | Test with previous Node version | +| Goroutine count grows | Context not cancelled on shutdown | Verify `orch.Stop()` called | +| Memory never drops | Pool drain not triggered | Check `WORKER_MAX_MEM_MB` value | +| Retry queue unbounded | Veriflier unreachable | Check veriflier connectivity | +| High allocations | Keyword-check body reads | Reduce `NUM_WORKERS` | -### 8. Restore Normal Settings +### 6. 
Restore Normal Settings -Remind user to restore normal config values after testing: -- `WORKER_MAX_MEM_MB`: 53 -- `WORKER_MAX_CHECKS`: 10000 +After testing, remind user to restore: +- `STATSD_SEND_MEM_USAGE`: false (avoid extra StatsD traffic in production) diff --git a/.claude/commands/docker-test.md b/.claude/commands/docker-test.md index 4c6b1d06..9a942f18 100644 --- a/.claude/commands/docker-test.md +++ b/.claude/commands/docker-test.md @@ -1,68 +1,80 @@ # Docker Test Environment -Run, debug, and test Jetmon using the Docker development environment. +Run, debug, and test Jetmon 2 using the Docker development environment. ## Instructions -Help the user test Jetmon in the Docker environment. Follow these steps: +Help the user test Jetmon 2 in the Docker environment. Follow these steps: ### 1. Check Docker Status First, check if the Docker environment is already running: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose ps +cd docker && docker compose ps ``` ### 2. Start Services (if needed) If services aren't running, start them: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose up -d +cd docker && docker compose up -d ``` Wait a few seconds for services to initialize, then verify: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose ps +docker compose ps ``` ### 3. 
Ask User What They Want to Test Present these options: - **View logs** - Watch Jetmon or Veriflier logs in real-time -- **Check worker status** - See worker activity and stats +- **Operator dashboard** - Open http://localhost:8080 in a browser - **Test with sample sites** - Insert test URLs into database -- **Test configuration reload** - Send SIGHUP to master process -- **Test graceful shutdown** - Verify shutdown behavior +- **Test configuration reload** - Send SIGHUP to reload config +- **Test graceful drain** - Verify drain/shutdown behaviour - **Test Veriflier connectivity** - Check Veriflier is responding +- **View audit log** - Query the audit log for a specific blog - **View metrics** - Check StatsD/Graphite dashboard ### 4. Execute Based on Selection **View logs:** ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose logs -f jetmon +docker compose logs -f jetmon ``` -**Check worker status:** +**Check process and stats:** ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon ps auxf -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon cat stats/sitespersec +docker compose exec jetmon ps aux +docker compose exec jetmon cat stats/sitespersec +docker compose exec jetmon cat stats/sitesqueue ``` **Test with sample sites:** First check if table exists and has data: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec mysqldb mysql -u root -p123456 jetmon_db -e "SELECT COUNT(*) as count FROM jetpack_monitor_sites;" 2>/dev/null +docker compose exec mysqldb mysql -u root -p123456 jetmon_db -e "SELECT COUNT(*) as count FROM jetpack_monitor_sites;" 2>/dev/null ``` If empty or table doesn't exist, offer to create test data per `running-tests.md`. 
**Test configuration reload:** ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon sh -c 'kill -HUP $(pgrep -f "node lib/jetmon.js" | head -1)' +docker compose exec jetmon ./jetmon2 reload +``` + +**Test drain/graceful shutdown:** +```bash +docker compose exec jetmon ./jetmon2 drain ``` **Test Veriflier connectivity:** ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon curl -k https://veriflier:7801/get/status +docker compose exec jetmon curl http://veriflier:7803/status +``` + +**View audit log:** +```bash +docker compose exec jetmon ./jetmon2 audit --blog-id 1 --since 1h ``` **View metrics:** @@ -70,10 +82,10 @@ Tell user to open http://localhost:8088 and navigate to `Metrics > stats > com > ### 5. Cleanup (if requested) ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose down +docker compose down ``` Or to fully reset with fresh database: ```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose down -v +docker compose down -v ``` diff --git a/.claude/commands/rebuild-addon.md b/.claude/commands/rebuild-addon.md deleted file mode 100644 index 4110d733..00000000 --- a/.claude/commands/rebuild-addon.md +++ /dev/null @@ -1,92 +0,0 @@ -# Rebuild Native Addon - -Rebuild the C++ native addon after making changes to `src/http_checker.cpp` or related C++ files. - -## Instructions - -When the user has modified C++ code and needs to rebuild the native addon, follow these steps: - -### 1. Check What Changed -First, identify what C++ files were modified: -```bash -git -C /Users/rdcoll/Code/a8c/jetmon status --porcelain | grep -E '\.(cpp|h|gyp)$' -``` - -### 2. Determine Build Environment - -Ask the user: **Are you running in Docker or locally?** - -### 3a. 
Docker Build (Recommended) - -Check if Docker is running: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose ps -``` - -If not running, start it: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose up -d -``` - -Rebuild and restart inside container: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon npm run rebuild-run -``` - -Or if you want to rebuild without auto-running: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon sh -c 'node-gyp rebuild && cp build/Release/jetmon.node lib/' -``` - -Then restart Jetmon: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose restart jetmon -``` - -### 3b. Local Build - -Run the npm script: -```bash -cd /Users/rdcoll/Code/a8c/jetmon && npm run rebuild-run -``` - -Or manually: -```bash -cd /Users/rdcoll/Code/a8c/jetmon && node-gyp rebuild && cp build/Release/jetmon.node lib/ -``` - -### 4. Verify Build Success - -Check that the new `.node` file was created: -```bash -ls -la /Users/rdcoll/Code/a8c/jetmon/lib/jetmon.node -``` - -### 5. Test the Addon - -Create a quick test to verify the addon loads correctly: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon node -e "const c = require('./lib/jetmon.node'); console.log('Addon loaded successfully');" -``` - -Or run a simple HTTP check: -```bash -cd /Users/rdcoll/Code/a8c/jetmon/docker && docker compose exec jetmon node -e " -const checker = require('./lib/jetmon.node'); -checker.http_check('https://wordpress.com', 443, 0, function(idx, rtt, http, err) { - console.log('RTT:', rtt, 'HTTP:', http, 'Error:', err); - process.exit(0); -}); -" -``` - -### 6. 
Watch for Issues - -If the build fails, common issues include: -- Missing build tools: `node-gyp` requires Python and a C++ compiler -- Node.js version mismatch: Addon must be built for the running Node.js version -- OpenSSL issues: Check that OpenSSL dev headers are available - -If Jetmon crashes after rebuild: -- Check logs: `docker compose logs jetmon` -- Verify the addon API hasn't changed incompatibly diff --git a/.claude/rules/coding-standards.md b/.claude/rules/coding-standards.md index 22f306b0..6a63f7b3 100644 --- a/.claude/rules/coding-standards.md +++ b/.claude/rules/coding-standards.md @@ -5,8 +5,52 @@ Follow coding standards in this order: 1. Existing patterns in the codebase 2. Conventions documented in this file -3. Node.js best practices (for JavaScript) -4. Google C++ Style Guide (for C++, with local modifications) +3. Effective Go (https://go.dev/doc/effective_go) for Go code + +--- + +## Go + +### Formatting +- Run `gofmt` / `goimports` — all Go code must be formatted +- Tabs for indentation (enforced by gofmt) +- Line length: no hard limit; prefer readability over brevity + +### Naming Conventions +- **Packages**: lowercase, single word (e.g., `checker`, `wpcom`, `audit`) +- **Exported identifiers**: `PascalCase` +- **Unexported identifiers**: `camelCase` +- **Acronyms**: all-caps when exported (`HTTPCode`, `RTTMs`, `URL`), lowercase otherwise (`httpCode`) +- **Error variables**: `ErrFoo` for sentinel errors +- **Interfaces**: noun or `-er` suffix (`Checker`, `Client`) +- **Constants**: `PascalCase` for Go constants; config key strings use `SCREAMING_SNAKE_CASE` to match existing JSON keys + +### Error Handling +- Return errors; do not panic in library code +- Wrap with context: `fmt.Errorf("connect: %w", err)` +- Log and continue for non-fatal errors; `log.Fatalf` only at startup + +### Concurrency +- Pass `context.Context` as the first argument to any function that blocks or does I/O +- Use `sync/atomic` for hot-path counters; `sync.Mutex` for 
struct guards +- Never share mutable state across goroutines without synchronisation +- Prefer buffered channels sized to the expected burst; document the rationale + +### Imports +- Standard library first, then external, then internal — separated by blank lines +- Alias internal `grpc` package as `vgrpc` to avoid collision with `google.golang.org/grpc` +- Alias `"context"` as `stdctx` only when the local scope shadows the package name + +### Comments +- Package comment on every package (`// Package foo ...`) +- Exported symbol comments are required (`// Foo does ...`) +- Inline comments explain *why*, not *what* + +--- + +## Legacy (JavaScript / C++) + +The codebase was previously Node.js + C++. Those conventions are no longer relevant — the Go section above takes full precedence. The sections below are retained only as historical reference and should not be followed for new work. --- @@ -476,4 +520,4 @@ kill -HUP - Documentation standards: `.claude/rules/documentation.md` - Configuration options: `config/config.readme` - Docker setup: `docker/` directory -- Veriflier build: `veriflier/README.md` +- Veriflier binary: `veriflier2/cmd/main.go` diff --git a/.claude/rules/documentation.md b/.claude/rules/documentation.md index a51998d4..b73a869e 100644 --- a/.claude/rules/documentation.md +++ b/.claude/rules/documentation.md @@ -97,7 +97,7 @@ For main README, use this structure with `====` underlines (not `#` headers): 6. Running 7. 
Database (schema if applicable) -For component READMEs (e.g., `veriflier/README.md`), use minimal format: +For component READMEs (e.g., a future `veriflier2/README.md`), use minimal format: ```markdown component name ============== diff --git a/.claude/rules/general-guidelines.md b/.claude/rules/general-guidelines.md index 161d564f..7cf134f2 100644 --- a/.claude/rules/general-guidelines.md +++ b/.claude/rules/general-guidelines.md @@ -1,6 +1,6 @@ # General Guidelines for Jetmon Development -You are an expert in Node.js, C++, and high-performance systems programming. You have deep expertise in building scalable monitoring services, native Node.js addons, network programming, and multi-process architectures. You prioritize reliability and performance while delivering maintainable solutions for production infrastructure. +You are an expert in Go and high-performance systems programming. You have deep expertise in building scalable monitoring services, concurrent network programming, and production infrastructure. You prioritize reliability and performance while delivering maintainable solutions. ## Short Codes @@ -10,105 +10,115 @@ Check the start of any user message for the following short codes and act approp ## Key Principles -- Write concise, technical code with accurate JavaScript and C++ examples. +- Write idiomatic Go — prefer stdlib, use goroutines and channels correctly. - Follow the established code style conventions (see `coding-standards.md`). -- Use callback-based asynchronous patterns (not Promises/async-await) in JavaScript. +- No Promises/async patterns — Go uses goroutines, channels, and `context.Context` for concurrency. - Prefer modularization over duplication. 
-- Use descriptive function, variable, and file names following existing conventions: - - JavaScript: `camelCase` for functions, `SCREAMING_SNAKE_CASE` for constants - - C++: `snake_case` for methods, `m_` prefix for member variables +- Use descriptive names following existing conventions: + - Go packages: `lowercase`, single-word when possible + - Exported identifiers: `PascalCase` + - Unexported identifiers: `camelCase` + - Constants: `PascalCase` (Go-idiomatic) or `SCREAMING_SNAKE_CASE` for config keys - Use lowercase with hyphens for new directories. -- Favor IPC messaging for process communication over shared state. +- Pass `context.Context` as the first argument to functions that do I/O or may block. ## Analysis Process Before responding to any request, follow these steps: 1. **Request Analysis** - - Determine if task involves master process, worker process, native addon, or veriflier - Identify which component(s) need modification: - - `lib/jetmon.js` - Master process orchestration - - `lib/httpcheck.js` - Worker process logic - - `src/http_checker.cpp` - Native addon HTTP checking - - `veriflier/` - Geographic verification service + - `cmd/jetmon2/` - Main binary entry point (CLI subcommands, signal handling) + - `internal/orchestrator/` - Round loop, bucket coordination, retry queue + - `internal/checker/` - HTTP check logic (httptrace, SSL, keyword, redirect) + - `internal/checker/pool.go` - Auto-scaling goroutine pool + - `internal/db/` - MySQL queries and migrations + - `internal/config/` - Config loading, validation, hot reload + - `internal/veriflier/` - Veriflier client/server (JSON-over-HTTP; swap for true gRPC after `make generate`) + - `internal/wpcom/` - WPCOM notification client with circuit breaker + - `internal/audit/` - Audit log read/write + - `internal/metrics/` - StatsD UDP client, stats file writer + - `internal/dashboard/` - SSE operator dashboard + - `veriflier2/cmd/` - Standalone veriflier binary - Note compatibility requirements: - - 
Node.js version (currently v24) - - C++ compiler requirements for native addon - - Qt5 for veriflier builds + - Go 1.22 (uses range-over-integer, builtin `min`/`max`) + - MySQL 8.0 (Docker) / MySQL 5.7+ (production) - Define core functionality and reliability goals - - Consider memory usage implications (worker recycling thresholds) - - Consider observability requirements (StatsD metrics) + - Consider goroutine pool scaling implications + - Consider observability requirements (StatsD metrics, audit log) 2. **Solution Planning** - - Break into process-compatible components - - Identify required IPC message types - - Plan for configuration via `config.json` + - Break into package-compatible components + - Identify required channel/interface contracts + - Plan for configuration via `config/config.json` - Evaluate performance impact: - - Memory usage per worker + - Pool queue depth and goroutine count - Check throughput (sites per second) - Network timeout handling - - Consider horizontal scaling implications (bucket ranges) + - Consider horizontal scaling implications (bucket ranges, heartbeat) 3. **Implementation Strategy** - - Choose appropriate patterns for the target component - - Consider impact on worker lifecycle (memory limits, check counts) - - Plan for graceful error handling and logging - - Ensure metrics are emitted for observability + - Choose appropriate Go patterns for the target component + - Use `context.Context` for cancellation propagation + - Plan for graceful error handling and structured logging + - Ensure StatsD metrics are emitted for significant events - Verify changes work in Docker development environment - - After proposing any code change, always provide specific manual testing steps the user should follow. Jetmon has no automated test suite — manual verification is mandatory for every change. Reference `running-tests.md` for the Docker testing environment. 
+ - After proposing any code change, always provide specific manual testing steps the user should follow. Reference `running-tests.md` for the Docker testing environment. ## Architecture Awareness -### Process Boundaries -- Master process (`jetmon.js`): Orchestration only, no direct HTTP checks -- Worker processes (`httpcheck.js`): Disposable, recycled on limits -- SSL server (`server.js`): Receives veriflier responses only -- Veriflier: Independent Qt application, communicates via HTTPS +### Package Boundaries +- `cmd/jetmon2`: Entry point only; delegates to internal packages +- `internal/orchestrator`: Owns the round loop, retry state, and bucket leases +- `internal/checker`: Stateless HTTP check; no global state +- `internal/checker/pool`: Auto-scaling goroutine pool; driven by queue depth +- `internal/veriflier`: Thin transport layer; JSON-over-HTTP until protoc generates real stubs +- `internal/wpcom`: Owns WPCOM circuit breaker and notification queue ### Data Flow ``` -Database → Master → Workers → C++ Addon → HTTP Checks +Database → Orchestrator → Pool → checker.Check → Results ↓ - Verifliers (geo-distributed) + Veriflier gRPC clients (geo-distributed) ↓ - WordPress.com API + WPCOM API (circuit-broken notification queue) ``` ### Critical Constraints -- Workers must not exceed `WORKER_MAX_MEM_MB` (53MB default) -- Workers recycle after `WORKER_MAX_CHECKS` (10,000 default) -- Retry queues must persist between rounds (not flushed) -- Bucket ranges must not overlap between hosts +- Retry queue must persist between rounds (never flushed at round start) +- Bucket ranges must not overlap between hosts (MySQL `SELECT ... 
FOR UPDATE` enforces this) +- Heartbeat must fire every round; WatchdogSec=120s means missing two rounds triggers systemd restart +- Circuit breaker floor: at least 1 veriflier quorum, even if all verifliers are offline ## Production Considerations ### Before Modifying Code -- Test changes locally using Docker environment -- Verify memory usage patterns with extended runs +- Test changes locally using Docker environment (`docker compose up -d`) +- Verify goroutine count and memory do not grow unboundedly - Check that StatsD metrics are properly emitted -- Ensure graceful shutdown behavior is preserved +- Ensure graceful shutdown behaviour is preserved (SIGINT → `orch.Stop()`) ### Deployment Process - Changes require Systems team deployment - Create a Systems Request with PR links -- Test in Docker before requesting production deploy +- Run `./jetmon2 validate-config` before deploying ### Performance Sensitivity -- RTT (round-trip time) calculations affect timeout behavior -- Node.js version changes can impact performance characteristics -- Memory leaks compound over time due to long-running processes +- RTT calculations feed into timeout heuristics — don't add unnecessary latency +- Pool auto-scaling fires every 5 seconds; don't block the scale goroutine +- `runtime.ReadMemStats` is stop-the-world; call it infrequently ## Security Considerations -- Authentication tokens in config must not be logged -- SSL certificates are required for veriflier communication -- Database credentials are stored separately in `db-config.conf` +- Auth tokens in config must not be logged +- gRPC/HTTP veriflier auth token is validated per-request in `internal/veriflier/server.go` +- Database credentials are stored in `config/db-config.conf` (not committed) - Never commit secrets to the repository ## Testing Approach +- Use `go test ./...` for unit tests - Use Docker environment for integration testing - Enable `DB_UPDATES_ENABLE` only in local test environments -- Verify worker 
spawn/death cycle works correctly - Test graceful shutdown with SIGINT -- Monitor memory growth over extended runs +- Monitor goroutine count over extended runs (`/debug/pprof/goroutine`) diff --git a/.claude/rules/running-tests.md b/.claude/rules/running-tests.md index cd19b1ec..6a0223a9 100644 --- a/.claude/rules/running-tests.md +++ b/.claude/rules/running-tests.md @@ -1,6 +1,14 @@ # Running Tests -Jetmon does not have a formal automated test suite. Testing is performed manually using the Docker development environment. +Jetmon 2 has a Go test suite (`go test ./...`) and a Docker development environment for integration testing. + +## Automated Tests + +```bash +make test # go test ./... +make test-race # go test -race ./... +make lint # go vet ./... +``` ## Prerequisites @@ -22,45 +30,29 @@ docker compose down # Stop all services docker compose down -v # Stop and remove volumes (fresh start) ``` -Services started: `mysqldb` (MySQL 5.7), `jetmon` (master + workers), `veriflier`, `statsd` (Graphite) +Services started: `mysqldb` (MySQL 8.0), `jetmon` (single binary), `veriflier`, `statsd` (Graphite) ### View Logs ```bash docker compose logs -f jetmon # Follow Jetmon logs docker compose logs -f veriflier # Follow Veriflier logs -docker compose exec jetmon cat logs/jetmon.log -docker compose exec jetmon cat logs/status-change.log ``` ### Monitor Activity ```bash docker compose exec jetmon cat stats/sitespersec docker compose exec jetmon cat stats/sitesqueue -docker compose exec jetmon ps auxf # Process tree: master, workers, server +docker compose exec jetmon ps aux # Single process — no worker tree ``` ## Test Database Setup -### Create Table +The Docker entrypoint automatically runs `./jetmon2 migrate` on startup. 
For manual testing, connect to MySQL: + +```bash docker compose exec mysqldb mysql -u root -p123456 jetmon_db ``` -```sql -CREATE TABLE IF NOT EXISTS `jetpack_monitor_sites` ( - `jetpack_monitor_site_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT PRIMARY KEY, - `blog_id` bigint(20) unsigned NOT NULL, - `bucket_no` smallint(2) unsigned NOT NULL, - `monitor_url` varchar(300) NOT NULL, - `monitor_active` tinyint(1) unsigned NOT NULL DEFAULT 1, - `site_status` tinyint(1) unsigned NOT NULL DEFAULT 1, - `last_status_change` timestamp NULL DEFAULT current_timestamp(), - `check_interval` tinyint(1) unsigned NOT NULL DEFAULT 5, - INDEX `blog_id_monitor_url` (`blog_id`, `monitor_url`), - INDEX `bucket_no_monitor_active_check_interval` (`bucket_no`, `monitor_active`, `check_interval`) -); -``` - ### Insert Test Sites ```sql INSERT INTO jetpack_monitor_sites (blog_id, bucket_no, monitor_url, monitor_active, site_status) @@ -88,52 +80,39 @@ Edit `config/config.json`: ### Configuration Reload ```bash -docker compose exec jetmon ps aux | grep jetmon-master # Find PID -docker compose exec jetmon kill -HUP # Reload config +docker compose exec jetmon ./jetmon2 reload # Sends SIGHUP via PID file +# Or manually: +docker compose exec jetmon sh -c 'kill -HUP $(pgrep jetmon2)' ``` -### Graceful Shutdown +### Graceful Shutdown / Drain ```bash -docker compose exec jetmon kill -INT # Or: docker compose restart jetmon +docker compose exec jetmon ./jetmon2 drain # Sends SIGINT via PID file +# Or: docker compose stop jetmon ``` -### Veriflier Connectivity +### Validate Config ```bash -docker compose exec jetmon curl -k https://veriflier:7801/get/status -# Should return: OK +docker compose exec jetmon ./jetmon2 validate-config ``` -### Native Addon Rebuild +### Veriflier Connectivity ```bash -docker compose exec jetmon npm run rebuild-run -# Or manually: -docker compose exec jetmon bash -c "node-gyp rebuild && cp build/Release/jetmon.node lib/ && node lib/jetmon.js" +docker compose exec jetmon curl 
http://veriflier:7803/status +# Should return: {"hostname":"...","version":"...","status":"ok"} ``` -### Test HTTP Checker Directly -Create `lib/test-addon.js`: -```javascript -var checker = require( './jetmon.node' ); -checker.http_check( 'https://wordpress.com', 80, 0, function( index, rtt, http_code, error_code ) { - console.log( 'RTT:', rtt, 'HTTP:', http_code, 'Error:', error_code ); - process.exit( 0 ); -}); -``` -Run: `docker compose exec jetmon node lib/test-addon.js` +### Operator Dashboard +- Open http://localhost:8080 in a browser after starting Docker services. -### Worker Recycling -Set low limits in `config/config.json`: -```json -{ - "WORKER_MAX_CHECKS": 100, - "WORKER_MAX_MEM_MB": 30 -} +### Audit Log +```bash +docker compose exec jetmon ./jetmon2 audit --blog-id 1 --since 1h ``` -Watch: `docker compose logs -f jetmon | grep -E "(spawn|die|recycle|limit)"` ### Memory Monitoring ```bash -docker compose exec jetmon bash -c 'while true; do ps aux --sort=-%mem | head -10; sleep 5; done' +docker compose exec jetmon bash -c 'while true; do ps aux --sort=-%mem | head -5; sleep 5; done' ``` ### StatsD Metrics @@ -159,9 +138,10 @@ Query database: `docker compose exec mysqldb mysql -u root -p123456 jetmon_db -e | Problem | Check | |---------|-------| | Jetmon not starting | `docker compose ps mysqldb`, verify `config/db-config.conf` | -| No sites being checked | Verify `BUCKET_NO_MIN/MAX` matches data, `monitor_active = 1` | -| Veriflier connection fails | `docker compose ps veriflier`, check auth tokens match, SSL certs exist | +| No sites being checked | Verify `BUCKET_TOTAL/TARGET` and that `monitor_active = 1` in DB | +| Veriflier connection fails | `docker compose ps veriflier`, check auth tokens match | | StatsD not receiving | `docker compose exec jetmon ping statsd`, check for UDP errors | +| Migration fails | Check MySQL is up: `docker compose ps mysqldb` | ## Cleanup diff --git a/.claude/skills/create-issue/SKILL.md 
b/.claude/skills/create-issue/SKILL.md index 195794ff..6ebdd3c1 100644 --- a/.claude/skills/create-issue/SKILL.md +++ b/.claude/skills/create-issue/SKILL.md @@ -66,14 +66,17 @@ Brief description of the issue or need. Include error messages, logs, or metrics ## Affected Component(s) -- [ ] Master Process (`lib/jetmon.js`) -- [ ] Worker Process (`lib/httpcheck.js`) -- [ ] C++ Native Addon (`src/http_checker.cpp`) -- [ ] Veriflier (`veriflier/`) -- [ ] Database (`lib/database.js`) -- [ ] Configuration +- [ ] CLI / Entry Point (`cmd/jetmon2/main.go`) +- [ ] Orchestrator (`internal/orchestrator/`) +- [ ] HTTP Checker (`internal/checker/`) +- [ ] Goroutine Pool (`internal/checker/pool.go`) +- [ ] Database / Migrations (`internal/db/`) +- [ ] Configuration (`internal/config/`) +- [ ] gRPC / Veriflier Transport (`internal/grpc/`) +- [ ] WPCOM Client (`internal/wpcom/`) +- [ ] Operator Dashboard (`internal/dashboard/`) +- [ ] Veriflier Binary (`veriflier2/`) - [ ] Docker/Infrastructure -- [ ] WPCOM Integration ## Steps to Reproduce (if applicable) @@ -116,12 +119,12 @@ Workers are hitting memory limits more frequently than expected... ## Affected Component(s) -- [x] Worker Process (`lib/httpcheck.js`) +- [x] Goroutine Pool (`internal/checker/pool.go`) ## Acceptance Criteria -- [ ] Workers stay under 53MB memory limit -- [ ] No increase in worker recycling frequency +- [ ] Goroutine count stays bounded under sustained load +- [ ] No goroutine leak after pool drain EOF )" ``` diff --git a/.claude/skills/create-pr/SKILL.md b/.claude/skills/create-pr/SKILL.md index a533c7fa..faf8b380 100644 --- a/.claude/skills/create-pr/SKILL.md +++ b/.claude/skills/create-pr/SKILL.md @@ -37,21 +37,26 @@ Create a PR for the current branch, targeting `master`. 
| Component | Key Files | |-----------|-----------| -| Master Process | `lib/jetmon.js` | -| Worker Process | `lib/httpcheck.js` | -| C++ Native Addon | `src/http_checker.cpp`, `src/http_checker.h`, `binding.gyp` | -| Veriflier | `veriflier/*.cpp`, `veriflier/*.h` | -| Database | `lib/database.js`, `lib/dbpools.js` | -| Configuration | `config/config.json`, `config/config.readme` | +| CLI / Entry Point | `cmd/jetmon2/main.go` | +| Orchestrator | `internal/orchestrator/` | +| HTTP Checker | `internal/checker/checker.go` | +| Goroutine Pool | `internal/checker/pool.go` | +| Database | `internal/db/` | +| Config | `internal/config/config.go`, `config/config.readme` | +| gRPC / Veriflier Transport | `internal/grpc/` | +| WPCOM Client | `internal/wpcom/client.go` | +| Audit Log | `internal/audit/audit.go` | +| Metrics | `internal/metrics/metrics.go` | +| Operator Dashboard | `internal/dashboard/dashboard.go` | +| Veriflier Binary | `veriflier2/cmd/main.go` | | Docker | `docker/docker-compose.yml`, `docker/Dockerfile*` | -| StatsD/Metrics | Look for `statsdClient` calls | -| WPCOM Integration | `lib/wpcom.js`, `lib/comms.js` | +| Migrations | `internal/db/migrations.go`, `migrations/001_jetmon2.sql` | 6. **Determine testing requirements**: - - C++ changes require `npm run rebuild-run` - - Config changes should be tested with Docker environment - - Worker changes should be tested with memory monitoring - - Database changes need schema verification + - Config changes: test with `./jetmon2 validate-config` + - DB/schema changes: test migration with `./jetmon2 migrate` + - All changes: test with Docker environment (`docker compose up --build`) + - Run `make test` to verify unit tests pass 7. **Create the PR** using `gh pr create --draft --assignee @me` with this format: @@ -72,10 +77,11 @@ Brief description of what this PR accomplishes and why. 
## Testing -- [ ] Tested locally with Docker environment -- [ ] Ran `npm run rebuild-run` (if C++ changes) -- [ ] Verified memory usage is within limits (if worker changes) -- [ ] Tested configuration reload via SIGHUP (if config changes) +- [ ] Tested locally with Docker environment (`docker compose up --build`) +- [ ] `make test` passes +- [ ] `./jetmon2 validate-config` passes (if config changes) +- [ ] Migration tested with `./jetmon2 migrate` (if schema changes) +- [ ] Tested configuration reload via `./jetmon2 reload` (if config changes) ## Deployment Notes diff --git a/.claude/skills/debug-memory/SKILL.md b/.claude/skills/debug-memory/SKILL.md index 42c54898..e97fee89 100644 --- a/.claude/skills/debug-memory/SKILL.md +++ b/.claude/skills/debug-memory/SKILL.md @@ -1,12 +1,12 @@ --- name: debug-memory -description: Debug memory issues in Jetmon workers and identify leaks -allowed-tools: Bash(docker*), Bash(ps*), Bash(top*), Bash(node*), Read, Glob, Grep +description: Debug memory and goroutine issues in Jetmon 2 +allowed-tools: Bash(docker*), Bash(ps*), Bash(curl*), Bash(go*), Read, Glob, Grep --- # Debug Memory Issues -Use this skill to investigate memory problems in Jetmon workers, identify leaks, and optimize memory usage. +Use this skill to investigate memory growth and goroutine leaks in the Jetmon 2 Go binary. ## Usage @@ -16,224 +16,122 @@ Use this skill to investigate memory problems in Jetmon workers, identify leaks, ## Memory Architecture -### Worker Memory Limits +Jetmon 2 is a single binary with an auto-scaling goroutine pool. Memory pressure does +not cause crashes; the orchestrator drains the pool when memory exceeds `WORKER_MAX_MEM_MB`. -| Setting | Default | Purpose | -|---------|---------|---------| -| `WORKER_MAX_MEM_MB` | 53 | Memory limit before worker recycles | -| `WORKER_MAX_CHECKS` | 10,000 | Check count before worker recycles | - -Workers are designed to be disposable. When hitting limits, they stop accepting work and exit gracefully. 
- -### Memory Flow - -``` -Worker Process -├── Node.js Heap (V8) -│ ├── HTTP check callbacks -│ ├── Retry queues (arrToRetry) -│ └── Active checks (arrCheck) -├── Native Addon (C++) -│ └── HTTP_Checker instances -└── Buffers (TCP/SSL) -``` +Key memory consumers: +- Goroutine pool (each goroutine ~8KB stack, grows on demand) +- Retry queue (in-memory map, bounded by number of monitored sites) +- WPCOM notification queue (bounded at 1000 entries) +- HTTP response bodies (read up to 1MB for keyword checks) ## Monitoring Commands ### Docker Environment ```bash -# Real-time memory monitoring -docker compose exec jetmon bash -c 'while true; do ps aux --sort=-%mem | head -15; sleep 5; done' - -# Memory usage by process -docker compose exec jetmon ps aux --sort=-%mem +# Real-time process memory (single Go process) +docker compose exec jetmon bash -c 'while true; do ps -o pid,rss,vsz,comm -p $(pgrep jetmon2); sleep 5; done' -# Specific worker memory -docker compose exec jetmon bash -c 'ps -o pid,rss,vsz,comm | grep jetmon' +# Goroutine count and heap via pprof +curl http://localhost:8080/debug/pprof/goroutine?debug=1 | head -30 ``` -### Process Details +### pprof Profiles (via Operator Dashboard) + +The dashboard exposes `/debug/pprof/` endpoints: ```bash -# View process tree -docker compose exec jetmon ps auxf +# Heap profile — shows allocations +curl http://localhost:8080/debug/pprof/heap > heap.prof +go tool pprof heap.prof -# Memory maps (detailed) -docker compose exec jetmon bash -c 'cat /proc/$(pgrep -f jetmon-master)/status | grep -E "Vm|Rss"' +# Goroutine profile — detect leaks +curl http://localhost:8080/debug/pprof/goroutine > goroutine.prof +go tool pprof goroutine.prof + +# CPU profile (30s) +curl "http://localhost:8080/debug/pprof/profile?seconds=30" > cpu.prof +go tool pprof cpu.prof ``` -### StatsD Metrics +### Metrics -Check Graphite (http://localhost:8088) for: -- `stats.workers.*.memory` - Per-worker memory usage -- `stats.workers.recycle.count` - Worker 
recycling frequency -- `stats.workers.free.count` - Available workers +Check Graphite (http://localhost:8088): +- `stats.goroutines.*` — goroutine count over time +- `stats.memory.*` — heap and RSS metrics (requires `STATSD_SEND_MEM_USAGE: true`) ## Common Memory Issues -### 1. Retry Queue Growth +### 1. Goroutine Leak -**Symptom:** Memory grows steadily, especially during site outages. +**Symptom:** Goroutine count grows unboundedly. **Diagnosis:** ```bash -docker compose exec jetmon cat stats/sitesqueue +curl http://localhost:8080/debug/pprof/goroutine?debug=2 | grep -c "^goroutine " ``` -**Cause:** Large numbers of sites in retry queue (`arrToRetry`). +**Cause:** A goroutine is blocked on a channel that is never read, or a context is never cancelled. -**Solution:** Check retry queue flush logic. Ensure retries are processed, not accumulated. +**Solution:** Check that all goroutines started in `orchestrator.go` and `pool.go` exit +when `ctx.Done()` fires. Ensure `orch.Stop()` is called on shutdown. -### 2. Native Addon Leak +### 2. Retry Queue Growth -**Symptom:** Memory grows even with low check counts. +**Symptom:** Memory grows during extended site outages. **Diagnosis:** ```bash -# Enable debug mode in http_checker.cpp -#define DEBUG_MODE 1 +docker compose exec jetmon ./jetmon2 status +# Check RetryQueueSize in API response +curl http://localhost:8080/api/state | python3 -m json.tool ``` -Watch for: -- Unfreed buffers -- Socket descriptor leaks -- SSL context accumulation - -**Solution:** Review C++ destructor cleanup in `HTTP_Checker::~HTTP_Checker()`. - -### 3. Event Loop Blocking +**Cause:** Retry queue entries accumulate when verifliers are unreachable. -**Symptom:** Workers become unresponsive, memory spikes. +**Solution:** Check veriflier connectivity. Retry queue is expected to hold state for down +sites — it is not a leak, but a design feature. 
If it grows without bound with no site +outages, check `retryQueue.clear()` is being called in `handleRecovery`. -**Diagnosis:** -```bash -docker compose exec jetmon node --trace-warnings lib/jetmon.js -``` +### 3. HTTP Response Body Accumulation -**Solution:** Ensure async operations complete and callbacks fire. +**Symptom:** Memory spikes correlate with keyword-check sites. -### 4. DNS Resolution Caching +**Cause:** Keyword checks read up to 1MB of response body per check. With many such sites +and a large pool, this can total significant memory. -**Symptom:** Memory grows with unique domains checked. +**Solution:** Reduce `NUM_WORKERS` if memory is constrained. The 1MB cap is hard-coded in +`internal/checker/checker.go`. -**Diagnosis:** Check if `USE_GETADDRINFO` is enabled in http_checker.cpp. - -**Solution:** `getaddrinfo` uses more memory than `gethostbyname`. Consider trade-offs. - -## Memory Profiling - -### Node.js Heap Snapshot - -```javascript -// Add to lib/httpcheck.js for debugging -const v8 = require('v8'); -const fs = require('fs'); - -// Trigger heap snapshot -function dumpHeap() { - const filename = `/tmp/heap-${process.pid}-${Date.now()}.heapsnapshot`; - const stream = fs.createWriteStream(filename); - v8.writeHeapSnapshot(filename); - console.log('Heap snapshot written to:', filename); -} - -// Call when memory is high -if (process.memoryUsage().rss > 50 * 1024 * 1024) { - dumpHeap(); -} -``` - -### Memory Usage Logging - -Add to worker process: - -```javascript -setInterval(function() { - const mem = process.memoryUsage(); - logger.debug('Memory: RSS=' + Math.round(mem.rss / 1024 / 1024) + 'MB, ' + - 'Heap=' + Math.round(mem.heapUsed / 1024 / 1024) + 'MB'); -}, 30000); -``` - -## Reducing Memory Usage - -### Configuration Tuning - -```json -{ - "NUM_WORKERS": 40, // Reduce from 60 if memory constrained - "NUM_TO_PROCESS": 30, // Reduce parallel checks per worker - "WORKER_MAX_MEM_MB": 40, // Lower threshold for faster recycling - 
"WORKER_MAX_CHECKS": 5000 // Recycle more frequently -} -``` - -### Code Patterns - -**DO:** -```javascript -// Release references when done -arrCheck.splice(index, 1); // Remove processed items - -// Use callbacks, don't hold references -checker.http_check(url, port, index, function(result) { - // Process result immediately - sendResult(result); - // Callback goes out of scope -}); -``` - -**DON'T:** -```javascript -// Accumulate data without bounds -allResults.push(result); // Unbounded growth - -// Hold references longer than needed -var savedChecker = checker; // Prevents GC -``` - -## Testing Memory Fixes - -### Set Low Limits +## Configuration Tuning ```json { - "WORKER_MAX_MEM_MB": 30, - "WORKER_MAX_CHECKS": 100 + "NUM_WORKERS": 40, + "WORKER_MAX_MEM_MB": 200, + "STATSD_SEND_MEM_USAGE": true } ``` -### Monitor Recycling - -```bash -docker compose logs -f jetmon | grep -E "(spawn|die|recycle|memory)" -``` - -### Extended Run Test - -```bash -# Run for extended period, monitor memory growth -docker compose up -d jetmon -watch -n 5 'docker compose exec jetmon ps aux --sort=-%mem | head -10' -``` +- `NUM_WORKERS`: Upper bound on pool goroutines +- `WORKER_MAX_MEM_MB`: Triggers pool drain when Go RSS exceeds this (MB) +- `STATSD_SEND_MEM_USAGE`: Emit `runtime.MemStats` to StatsD each interval -## Key Files for Memory Investigation +## Key Files for Investigation | File | Memory-Related Code | |------|---------------------| -| `lib/httpcheck.js` | Worker arrays: `arrCheck`, `arrToRetry` | -| `lib/jetmon.js` | Master arrays: `arrWorkers`, `gCountSuccess` | -| `src/http_checker.cpp` | Buffer allocation, SSL contexts | -| `lib/config.js` | Memory limit settings | +| `internal/checker/pool.go` | Pool scaling, goroutine lifecycle | +| `internal/orchestrator/orchestrator.go` | Round loop, retry queue, pool drain | +| `internal/orchestrator/retry.go` | Retry queue implementation | +| `internal/wpcom/client.go` | Notification queue (bounded at 1000) | ## Checklist for 
Memory Issues -- [ ] Check worker recycling frequency in metrics -- [ ] Monitor retry queue size (`stats/sitesqueue`) -- [ ] Review recent code changes affecting arrays -- [ ] Verify C++ cleanup in destructor -- [ ] Test with reduced memory limits -- [ ] Check for unclosed connections/sockets -- [ ] Review setTimeout/setInterval cleanup -- [ ] Confirm process.send() callbacks complete +- [ ] Check goroutine count via pprof (is it growing?) +- [ ] Check retry queue size via `/api/state` +- [ ] Enable `STATSD_SEND_MEM_USAGE` and observe Graphite +- [ ] Capture heap profile before and after a round +- [ ] Verify `orch.Stop()` fully drains the pool on shutdown +- [ ] Check for unbounded channel accumulation in pool.go diff --git a/.claude/skills/docker-test/SKILL.md b/.claude/skills/docker-test/SKILL.md index 7deef78b..f61c449c 100644 --- a/.claude/skills/docker-test/SKILL.md +++ b/.claude/skills/docker-test/SKILL.md @@ -1,12 +1,12 @@ --- name: docker-test -description: Run, debug, and test Jetmon using the Docker development environment +description: Run, debug, and test Jetmon 2 using the Docker development environment allowed-tools: Bash(docker*), Bash(cd docker*), Read, Glob, Grep --- # Docker Testing Skill -Use this skill for running, debugging, and testing Jetmon in the Docker development environment. +Use this skill for running, debugging, and testing Jetmon 2 in the Docker development environment. 
## Usage @@ -22,9 +22,9 @@ The docker-compose environment includes: | Service | Port | Purpose | |---------|------|---------| -| `mysqldb` | 3306 | MySQL 5.7 database | -| `jetmon` | 7800 | Main monitoring service | -| `veriflier` | 7801 | Geographic verification | +| `mysqldb` | 3306 | MySQL 8.0 database | +| `jetmon` | 8080 | Jetmon 2 + operator dashboard | +| `veriflier` | 7803 | Geographic verification (gRPC) | | `statsd` | 8125/8088 | Metrics (Graphite UI on 8088) | ## Common Commands @@ -49,7 +49,8 @@ docker compose logs --tail=100 jetmon # Last 100 lines ```bash docker compose ps # Service status -docker compose exec jetmon ps auxf # Process tree inside container +docker compose exec jetmon ps aux # Single process inside container +docker compose exec jetmon ./jetmon2 status # Internal status via API ``` ### Stopping Services @@ -69,43 +70,44 @@ docker compose exec jetmon cat stats/totals docker compose exec jetmon cat stats/sitespersec ``` -### 2. Check Worker Activity +### 2. Open Operator Dashboard -```bash -# View worker stats -docker compose exec jetmon cat stats/sitesqueue - -# Monitor worker memory -docker compose exec jetmon bash -c 'ps aux --sort=-%mem | head -10' -``` +Navigate to http://localhost:8080 in a browser. The dashboard shows: +- Worker/goroutine count +- Retry queue size +- WPCOM circuit breaker state +- Bucket range owned by this host ### 3. Test Configuration Reload ```bash -# Find master process PID -docker compose exec jetmon ps aux | grep jetmon-master - -# Send SIGHUP to reload config -docker compose exec jetmon kill -HUP +docker compose exec jetmon ./jetmon2 reload # Sends SIGHUP via PID file +# Watch logs for "config reloaded" +docker compose logs -f jetmon ``` -### 4. Test Graceful Shutdown +### 4. Test Graceful Drain/Shutdown ```bash -# Send SIGINT for graceful shutdown -docker compose exec jetmon kill -INT +docker compose exec jetmon ./jetmon2 drain # Sends SIGINT via PID file +# Or: +docker compose stop jetmon +``` + +### 5. 
View Audit Log -# Or restart the container -docker compose restart jetmon +```bash +docker compose exec jetmon ./jetmon2 audit --blog-id 1 --since 1h ``` -### 5. View Status Changes +### 6. Test Veriflier Connectivity ```bash -docker compose exec jetmon tail -f logs/status-change.log +docker compose exec jetmon curl http://veriflier:7803/status +# Should return: {"hostname":"...","version":"...","status":"ok"} ``` -### 6. Check Database +### 7. Check Database ```bash docker compose exec mysqldb mysql -u root -p123456 jetmon_db -e "SELECT COUNT(*) FROM jetpack_monitor_sites;" @@ -150,27 +152,29 @@ Ensure `config/config.json` has: } ``` +### Validate Config Before Restart + +```bash +docker compose exec jetmon ./jetmon2 validate-config +``` + ### Attach to Container ```bash docker compose exec jetmon bash ``` -### Test Native Addon Directly +### Profile Goroutines / Memory (pprof) -Create `lib/test-addon.js`: -```javascript -var checker = require( './jetmon.node' ); +The dashboard exposes pprof at http://localhost:8080/debug/pprof/ -checker.http_check( 'https://wordpress.com', 80, 0, function( index, rtt, http_code, error_code ) { - console.log( 'RTT:', rtt, 'HTTP:', http_code, 'Error:', error_code ); - process.exit( 0 ); -}); -``` - -Run it: ```bash -docker compose exec jetmon node lib/test-addon.js +# Goroutine dump +curl http://localhost:8080/debug/pprof/goroutine?debug=1 + +# Heap profile +curl http://localhost:8080/debug/pprof/heap > heap.prof +go tool pprof heap.prof ``` ### Check Metrics @@ -183,26 +187,25 @@ Open http://localhost:8088 for Graphite UI. 
Navigate to: ### Jetmon Not Starting - Check database: `docker compose ps mysqldb` -- Verify config: `docker compose exec jetmon cat config/db-config.conf` -- Check for port conflicts on 7800, 7801, 7802 +- Validate config: `docker compose exec jetmon ./jetmon2 validate-config` +- Check migration output: `docker compose logs jetmon | head -30` ### No Sites Being Checked -- Verify sites exist in database -- Check bucket range matches data: `BUCKET_NO_MIN`, `BUCKET_NO_MAX` -- Ensure `monitor_active = 1` for test sites +- Verify sites exist in database with `monitor_active = 1` +- Check bucket ownership: `docker compose exec jetmon ./jetmon2 status` ### Veriflier Connection Failures - Check veriflier is running: `docker compose ps veriflier` -- Test connectivity: `docker compose exec jetmon curl -k https://veriflier:7801/get/status` -- Verify SSL certificates exist in `veriflier/certs/` +- Test connectivity: `docker compose exec jetmon curl http://veriflier:7803/status` +- Verify `VERIFLIER_AUTH_TOKEN` matches in both containers ### Memory Issues ```bash -# Monitor memory over time -docker compose exec jetmon bash -c 'while true; do ps aux --sort=-%mem | head -10; sleep 5; done' +# Monitor goroutine count and memory via pprof +curl http://localhost:8080/debug/pprof/goroutine?debug=1 | head -20 ``` ## Cleanup diff --git a/.claude/skills/jetmon-pre-ship/SKILL.md b/.claude/skills/jetmon-pre-ship/SKILL.md new file mode 100644 index 00000000..cde74808 --- /dev/null +++ b/.claude/skills/jetmon-pre-ship/SKILL.md @@ -0,0 +1,27 @@ +--- +name: jetmon-pre-ship +description: Run jetmon v2 pre-ship checklist before opening a PR +allowed-tools: Bash(go *), Bash(grep *), Bash(git *) +--- + +## Changed files +!`git diff main...HEAD --name-only` + +## Race detector +!`go test -race ./... 
2>&1 | tail -30` + +## Known pitfall checks +Retry queue flush (must not happen at round start): +!`grep -rn "RetryQueue\|retryQueue" internal/orchestrator/ | grep -i "flush\|clear\|reset\|= \[\]" || echo "OK"` + +Bucket claim outside transaction (must use SELECT FOR UPDATE): +!`grep -rn "UPDATE jetmon_hosts\|INSERT.*jetmon_hosts" internal/ | grep -v "_test.go" || echo "OK"` + +Non-context DB calls: +!`grep -rn "\.Query\b\|\.QueryRow\b\|\.Exec\b" internal/ | grep -v "Context\|_test.go" || echo "OK"` + +Open maintenance window risk: +!`grep -rn "maintenance_end" internal/ | grep -v "test\|nil\|IsZero" | head -10` + +## Review +Work through each result above. Flag any violation. Then confirm the checklist from AGENTS.md is satisfied. diff --git a/.claude/skills/rebuild-addon/SKILL.md b/.claude/skills/rebuild-addon/SKILL.md deleted file mode 100644 index 8381597d..00000000 --- a/.claude/skills/rebuild-addon/SKILL.md +++ /dev/null @@ -1,189 +0,0 @@ ---- -name: rebuild-addon -description: Rebuild the C++ native addon after making changes to http_checker.cpp -allowed-tools: Bash(npm run*), Bash(node-gyp*), Bash(docker*), Bash(cp*), Bash(ls*), Read, Glob, Grep ---- - -# Rebuild Native Addon - -Use this skill after making changes to the C++ native addon (`src/http_checker.cpp` or `src/http_checker.h`). - -## Usage - -- `/rebuild-addon` - Rebuild the addon and restart Jetmon -- `/rebuild-addon docker` - Rebuild inside Docker container -- `/rebuild-addon test` - Rebuild and run a quick test - -## Quick Reference - -### Using npm Script (Recommended) - -```bash -npm run rebuild-run -``` - -This runs `node-gyp rebuild`, copies the addon to `lib/`, and starts Jetmon. 
- -### Manual Build - -```bash -node-gyp rebuild -cp build/Release/jetmon.node lib/ -node lib/jetmon.js -``` - -### Docker Build - -```bash -docker compose exec jetmon npm run rebuild-run -``` - -Or manually inside the container: - -```bash -docker compose exec jetmon bash -cd /jetmon -node-gyp rebuild -cp build/Release/jetmon.node lib/ -node lib/jetmon.js -``` - -## Build Verification - -After building, verify the addon loads correctly: - -```bash -node -e "require('./lib/jetmon.node'); console.log('Addon loaded successfully');" -``` - -## Testing the Addon - -### Quick HTTP Check Test - -Create a test script: - -```javascript -// lib/test-addon.js -var checker = require( './jetmon.node' ); - -checker.http_check( 'https://wordpress.com', 80, 0, function( index, rtt, http_code, error_code ) { - console.log( 'Index:', index ); - console.log( 'RTT (microseconds):', rtt ); - console.log( 'HTTP Code:', http_code ); - console.log( 'Error Code:', error_code ); - process.exit( 0 ); -}); -``` - -Run it: -```bash -node lib/test-addon.js -``` - -### Expected Output - -- `index`: The index passed to the check (0 in this case) -- `rtt`: Round-trip time in microseconds -- `http_code`: HTTP response code (200 for success) -- `error_code`: 0 for success, non-zero for errors - -### Error Codes - -| Code | Meaning | -|------|---------| -| 0 | Success | -| 1 | Connection failed | -| 2 | Timeout | -| 3 | SSL error | -| 4 | DNS resolution failed | -| 5 | Too many redirects | - -## C++ Source Files - -| File | Purpose | -|------|---------| -| `src/http_checker.cpp` | Main HTTP checking implementation | -| `src/http_checker.h` | Header with class definition | -| `binding.gyp` | Node-gyp build configuration | - -## Common Issues - -### Build Errors - -**Missing OpenSSL headers:** -``` -fatal error: openssl/ssl.h: No such file or directory -``` -Solution: Install OpenSSL development package: -```bash -# macOS -brew install openssl - -# Ubuntu/Debian -apt-get install libssl-dev -``` - 
-**Node version mismatch:** -If you see ABI version errors, clean and rebuild: -```bash -node-gyp clean -node-gyp rebuild -``` - -### Runtime Errors - -**Addon not found:** -``` -Error: Cannot find module './jetmon.node' -``` -Solution: Copy the built addon: -```bash -cp build/Release/jetmon.node lib/ -``` - -**Symbol errors:** -Usually indicates Node.js version changed. Rebuild the addon. - -## Debugging C++ Code - -### Enable Debug Output - -In `src/http_checker.cpp`, set: -```cpp -#define DEBUG_MODE 1 -``` - -Debug output goes to stderr. - -### Memory Debugging - -For memory leaks, use Valgrind (Linux): -```bash -valgrind --leak-check=full node lib/jetmon.js -``` - -## Build Configuration - -The `binding.gyp` file configures the build: - -```json -{ - "targets": [{ - "target_name": "jetmon", - "sources": ["src/http_checker.cpp"], - "include_dirs": ["'; + const body = [ + marker, + '### Docker images built for this PR', + '', + `Built from \`${sha}\`. Pull with:`, + '', + '```bash', + `docker pull ghcr.io/automattic/jetmon:${sha}`, + `docker pull ghcr.io/automattic/veriflier:${sha}`, + '```', + '', + 'Images are `linux/amd64` only. On Apple Silicon, add `--platform linux/amd64`. 
' + + 'See [docs/docker-images.md](https://github.com/Automattic/jetmon/blob/v2/docs/docker-images.md) for run examples.', + ].join('\n'); + + const comments = await github.paginate(github.rest.issues.listComments, { + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + per_page: 100, + }); + const existing = comments.find(c => c.body && c.body.includes(marker)); + + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); + } diff --git a/.gitignore b/.gitignore index 01facc04..64c9bcad 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,41 @@ -build/ -node_modules/ +# Compiled binaries +bin/ +/jetmon2 + +# Editor and OS files .DS_Store -lib/jetmon.node +.idea/ +*.swp +*.swo + +# Secrets and local config .env -.idea +config/config.json +config/db-config.conf + +# Generated TLS certificates +certs/*.crt +certs/*.key + +# Generated veriflier runtime config (veriflier-sample.json is tracked) +veriflier2/config/veriflier.json + +# Generated protobuf Go stubs (produced by `make generate`) +*.pb.go + +# Runtime output dirs +docker/volumes/ +logs/*.log +stats/* +!logs/.gitkeep +!stats/.gitkeep + +# Go test coverage output +coverage.out +coverage.html + +# AI tool directories +.codex + +# Local Claude settings (project settings.json is tracked) +.claude/settings.local.json diff --git a/.npmrc b/.npmrc deleted file mode 100644 index 4d936e8e..00000000 --- a/.npmrc +++ /dev/null @@ -1 +0,0 @@ -unsafe-perm=true diff --git a/AGENTS.md b/AGENTS.md index e41411c9..ae7bdd99 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,151 +1,340 @@ # Jetmon Development Guidelines -You are an expert Node.js/C++ developer with extensive knowledge about WordPress and enterprise-level web services. 
+You are an expert Go developer with extensive knowledge about WordPress, enterprise-level web services, and high-performance network programming. ## Project Overview -Jetmon is a parallel HTTP health monitoring service that monitors Jetpack website uptime at scale. It performs HEAD requests against sites, uses geographically distributed Veriflier services to confirm downtime, and notifies WordPress.com of status changes. +Jetmon is a parallel HTTP uptime monitoring service that checks Jetpack websites at scale. Jetmon 2 is a complete rewrite of the original Node.js + C++ native addon service into a single Go binary. It retains full drop-in compatibility with all external interfaces — MySQL schema, WPCOM API payload, StatsD metric names, and log file format — while dramatically increasing concurrency, reducing memory usage, and eliminating the native addon compilation dependency. + +The Veriflier is rewritten in Go as well, replacing the Qt C++ dependency. JSON-over-HTTP on the configured Veriflier port is the v2 production Monitor-to-Veriflier transport; the proto contract is retained only as a schema reference for a possible future transport. + +See `docs/project.md` for the full project description, feature list, and performance benefit estimates. 
## Architecture ``` -Database → Master Process → Worker Pool → C++ HTTP Checks - ↓ - Veriflier Services (geo-distributed) - ↓ - WordPress.com API ← Status Notifications +┌──────────────────────────────────────────────────────────────────────┐ +│ jetmon2 (single binary) │ +│ │ +│ ┌─────────────┐ ┌─────────────┐ ┌──────────────┐ │ +│ │ Orchestrator│ │ Check Pool │ │ Veriflier │ │ +│ │ goroutine │ │ (goroutines)│ │ transport │ │ +│ └──────┬──────┘ └──────┬──────┘ └──────┬───────┘ │ +│ │ │ │ │ +│ ┌──────┴────────────────┴────────────────┴───────┐ │ +│ │ Internal channels │ │ +│ └─────────────────────┬──────────────────────────┘ │ +│ │ │ +│ ┌────────────────────┴────────────────────┐ │ +│ │ eventstore (jetmon_events + │ │ +│ │ jetmon_event_transitions writes) │ │ +│ └────────────────────┬────────────────────┘ │ +│ │ │ +│ ┌────────────┐ ┌────┴────────────┐ ┌──────────────────────┐ │ +│ │ REST API │ │ Webhook │ │ Alerting │ │ +│ │ /api/v1/ │ │ delivery │ │ delivery │ │ +│ │ + auth + │ │ worker │ │ worker │ │ +│ │ ratelimit │ │ (HMAC POST) │ │ (email/PD/Slack/Tm) │ │ +│ └─────┬──────┘ └────────┬────────┘ └──────────┬───────────┘ │ +│ │ │ │ │ +│ ┌─────┴──────┐ ┌──────┴──────────┐ ┌────────┴──────────────┐ │ +│ │ Operator │ │ Webhook │ │ Alert contact │ │ +│ │ dashboard │ │ receivers │ │ destinations │ │ +│ │ (SSE) │ │ (HTTPS) │ │ (HTTPS / SMTP / API) │ │ +│ └────────────┘ └─────────────────┘ └───────────────────────┘ │ +└────────────┬──────────────────────────┬──────────────────────────────┘ + │ │ + MySQL WPCOM API + StatsD (legacy notification path, + Log files still active alongside + alert contacts) ``` -**Master Process** (`lib/jetmon.js`): Spawns workers, fetches site batches from database every 5 seconds, distributes work, and notifies WordPress.com of status changes. 
+**Orchestrator goroutine** (`internal/orchestrator/`): Fetches site batches from MySQL, dispatches work to the check pool via channels, processes results, manages the local retry queue, coordinates Veriflier confirmation requests, and emits WPCOM legacy notifications. Owns all DB access for site state and writes events through `eventstore`. -**Worker Processes** (`lib/httpcheck.js`): Forked child processes that perform HTTP checks via C++ native addon. Workers recycle when reaching memory limit (53MB) or check count (10,000). +**Check Pool** (`internal/checker/`): A bounded goroutine pool that performs HTTP checks using Go's `net/http` and `net/http/httptrace`. Records DNS, TCP connect, TLS handshake, and TTFB timings for every check. Pool size auto-scales against queue depth within configured min/max bounds. -**C++ Native Addon** (`src/http_checker.cpp`): High-performance HTTP checking with HEAD requests, 60-second timeout, OpenSSL support, and redirect handling. +**Eventstore** (`internal/eventstore/`): The single writer for `jetmon_events` and `jetmon_event_transitions`. Every status / severity / state change is written transactionally so the event row's projection and the transition log can never disagree. Both downstream workers (webhooks, alerting) consume `jetmon_event_transitions` via a high-water mark. -**Veriflier Services** (`veriflier/`): C++/Qt applications deployed globally to verify downtime before status changes are reported. +**REST API** (`internal/api/`): The internal API surface (`/api/v1/...`) used by the gateway, alerting workers, dashboards, and CI tooling. Per-consumer Bearer-token auth (`internal/apikeys/`), per-key rate limiting, Stripe-style idempotency keys on POSTs. Sites CRUD, events list / single / transitions, SLA stats, webhooks CRUD, alert-contacts CRUD, manual delivery retry. 
-## Build and Run Commands +**Webhook delivery worker** (`internal/webhooks/`): Polls `jetmon_event_transitions`, matches each new transition against active webhooks (event-type + site + state filters), and POSTs HMAC-signed payloads to consumer URLs. Retry ladder 1m / 5m / 30m / 1h / 6h then abandon. Per-webhook in-flight cap and shared dispatch pool. -```bash -# Docker development (recommended) -cd docker && docker compose up -d # Start all services -docker compose down # Stop services +**Alerting delivery worker** (`internal/alerting/`): Same shape as the webhook worker but for managed channels — email (via `wpcom`/`smtp`/`stub` senders), PagerDuty Events API v2, Slack incoming webhooks, Microsoft Teams. Filter is simpler (`site_filter` + `min_severity`); per-contact `max_per_hour` rate cap absorbs pager storms. Send-test endpoint exercises the same dispatch path without requiring a real event. -# Manual build and run -npm install -node-gyp rebuild -cp build/Release/jetmon.node lib/ -node lib/jetmon.js +**Current delivery-owner constraint:** In the single-binary v2 deployment, `API_PORT > 0` starts the API server and makes webhook / alert-contact delivery workers eligible to run. Delivery rows are claimed transactionally, so multiple active delivery workers do not claim the same pending row. Use `DELIVERY_OWNER_HOST` as a rollout guard when intentionally keeping delivery single-owner during migration from embedded to standalone delivery. -# Rebuild and run (npm script) -npm run rebuild-run -``` +**Veriflier transport** (`internal/veriflier/`): JSON-over-HTTP client/server for Monitor↔Veriflier communication. Replaces the previous SSL server and custom HTTPS protocol. This is the v2 production transport. -## Configuration +**Veriflier** (`veriflier2/`): Standalone Go binary deployed at remote locations. Receives check batches from the Monitor, performs HTTP checks, and returns results. Replaces the Qt C++ Veriflier. 
-Copy `config/config-sample.json` to `config/config.json`. Key settings: +**Future shape:** the API server, webhook worker, and alerting worker are independently scalable concerns and the natural target for the multi-binary split tracked in `docs/roadmap.md`. Today they coexist in `jetmon2` and the MySQL schema is the bus between them; tomorrow the deliverer becomes its own binary handling all outbound dispatch (webhooks + alerting + WPCOM legacy migrated behind it). -- `NUM_WORKERS`: Worker process count (default 60) -- `NUM_TO_PROCESS`: Parallel checks per worker (default 40) -- `BUCKET_NO_MIN/MAX`: Database bucket range for horizontal scaling (0-511 total) -- `MIN_TIME_BETWEEN_ROUNDS_SEC`: Check interval (300 seconds default) -- `PEER_OFFLINE_LIMIT`: Verifliers required to confirm downtime (3) +## Key Files -**Variable Check Intervals:** Sites can be configured for 1-5 minute check intervals via the `check_interval` database field. The default is 5 minutes. One-minute intervals require sufficient host capacity. 
+| Path | Purpose | +|------|---------| +| `cmd/jetmon2/main.go` | Binary entry point, signal handling, startup | +| `internal/orchestrator/` | Round scheduling, DB fetch, work dispatch, WPCOM notifications | +| `internal/checker/` | Goroutine pool, HTTP checks, httptrace timing | +| `internal/veriflier/` | JSON-over-HTTP client/server for Veriflier communication | +| `internal/db/` | MySQL access, `jetmon_hosts` heartbeat, connection pooling | +| `internal/config/` | Config loading, SIGHUP hot-reload | +| `internal/metrics/` | StatsD client, stats file writer | +| `internal/wpcom/` | WPCOM API client, circuit breaker | +| `internal/audit/` | Operational log writes to `jetmon_audit_log` (WPCOM, retries, verifier RPCs, config reloads) | +| `internal/eventstore/` | Event-sourced site state — manages `jetmon_events` + `jetmon_event_transitions` writes in single transactions | +| `internal/api/` | Internal REST API server (`/api/v1/...`) — auth, rate limiting, idempotency, sites/events/SLA/webhooks/alert-contacts handlers | +| `internal/apikeys/` | API key registry, sha256-hashed at rest; `./jetmon2 keys` CLI | +| `internal/webhooks/` | Webhook registry + delivery worker — outbound HMAC-signed POSTs of event transitions, retry ladder 1m/5m/30m/1h/6h | +| `internal/alerting/` | Alert contact registry + delivery worker — managed channels (email/PagerDuty/Slack/Teams) with site_filter + severity gate + per-hour rate cap | +| `internal/dashboard/` | Operator dashboard, SSE handler | +| `veriflier2/` | Go Veriflier binary | +| `docs/internal-api-reference.md` | Internal REST API reference (auth, all endpoints, payload shapes) | +| `docs/roadmap.md` | Deferred features and architectural roadmap (multi-binary split, public-API path) | +| `docs/adr/` | Architecture Decision Records — load-bearing decisions ("why is X like this") with context, decision, and consequences | +| `docs/project.md` | Full project description and feature specification | + +## Build and Run -See 
`config/config.readme` for detailed documentation of all options. +```bash +# Docker development (recommended) +cd docker && docker compose up -d # Start all services +docker compose up --build # Rebuild binary and start +docker compose down # Stop services +docker compose down -v # Stop and remove volumes (fresh start) + +# Build binaries directly +make all + +# Use a non-default Go binary when needed +make GO=/path/to/go all + +# Run tests +make test +make test-race +make lint + +# Run with race detector +go run -race ./cmd/jetmon2/ + +# Validate config +./jetmon2 validate-config + +# CLI subcommands +./jetmon2 version +./jetmon2 migrate +./jetmon2 status +./jetmon2 audit --blog-id 12345 --since 2h +./jetmon2 rollout guided +./jetmon2 rollout host-preflight +./jetmon2 rollout pinned-check +./jetmon2 rollout cutover-check +./jetmon2 rollout dynamic-check +./jetmon2 rollout projection-drift +./jetmon2 rollout state-report +./jetmon2 site-tenants import --file site-tenants.csv --dry-run +./jetmon2 drain +./jetmon2 reload +``` -## Key Files +## Configuration -| File | Purpose | -|------|---------| -| `lib/jetmon.js` | Master process orchestration | -| `lib/httpcheck.js` | Worker process HTTP checking | -| `lib/database.js` | MySQL queries and connection | -| `lib/comms.js` | HTTPS communication with Verifliers | -| `lib/wpcom.js` | WordPress.com API notifications | -| `lib/server.js` | SSL server for Veriflier responses | -| `lib/statsd.js` | StatsD metrics client | -| `src/http_checker.cpp` | C++ native addon for HTTP checks | -| `binding.gyp` | Node-gyp build configuration | +Copy `config/config-sample.json` to `config/config.json`. All keys from the original Jetmon are honoured; new keys are additive. Send SIGHUP to hot-reload config without restarting. 
+ +**Existing keys (unchanged behaviour):** +- `NUM_WORKERS`: Goroutine pool size (replaces worker process count) +- `NUM_TO_PROCESS`: Legacy compatibility setting retained so copied v1-style configs parse; it does not cap Go scheduler throughput +- `DATASET_SIZE`: Database fetch page size for scheduler work; the scheduler continues fetching pages until due work is drained +- `MIN_TIME_BETWEEN_ROUNDS_SEC`: Fixed-cadence full-fleet pass interval when `USE_VARIABLE_CHECK_INTERVALS` is false +- `NET_COMMS_TIMEOUT`: Default per-check HTTP timeout in seconds +- `PEER_OFFLINE_LIMIT`: Veriflier agreements required to confirm downtime +- `WORKER_MAX_MEM_MB`: Go runtime memory threshold that triggers worker-pool drain (replaces worker recycling) + +**New keys:** +- `BUCKET_TOTAL`: Total bucket range (e.g. 1000); replaces static `BUCKET_NO_MIN/MAX` +- `BUCKET_TARGET`: Maximum buckets this host should own +- `BUCKET_HEARTBEAT_GRACE_SEC`: Seconds before an unresponsive host's buckets are reclaimed (suggested: 2× round time) +- `PINNED_BUCKET_MIN/MAX`: Migration-only static bucket range for replacing one v1 host with one v2 host; disables `jetmon_hosts` dynamic ownership while set. Legacy `BUCKET_NO_MIN/MAX` are accepted as aliases for this mode. 
+- `ALERT_COOLDOWN_MINUTES`: Default cooldown between repeated alerts for the same site +- `LEGACY_STATUS_PROJECTION_ENABLE`: Keep v1 `site_status` / `last_status_change` projection updated during shadow-v2-state migration +- `LOG_FORMAT`: `text` (default, drop-in compatible) or `json` (structured logging) +- `USE_VARIABLE_CHECK_INTERVALS`: Respect per-site `check_interval`; the scheduler uses a short idle poll and maintained `jetmon_site_runtime.next_check_at` timestamps control which sites are ready in legacy round-scheduler mode +- `DASHBOARD_PORT`: Internal port for the operator dashboard (0 to disable) +- `DEBUG_PORT`: localhost-only pprof port, default 6060 (0 to disable; never exposed remotely) + +See `config/config.readme` for the full option reference. + +## Drop-in Compatibility Requirements + +These interfaces must remain identical to the original Jetmon. Do not change them without explicit discussion: + +| Interface | Constraint | +|-----------|-----------| +| MySQL schema | Read same columns; additive migrations only | +| WPCOM notification payload | Same JSON structure and field names | +| StatsD metric names | Same dotted paths; new metrics may be added | +| Log file paths and format | `logs/jetmon.log`, `logs/status-change.log` | +| `stats/` file outputs | `sitespersec`, `sitesqueue`, `totals` — same format | +| `config/config.json` keys | All existing keys honoured | +| SIGHUP config reload | Same behaviour | +| SIGINT graceful shutdown | Same behaviour | ## Site Status Values -- `0` SITE_DOWN: Local checks failed +- `0` SITE_DOWN: Local checks failed, retry/verification in progress - `1` SITE_RUNNING: Confirmed online -- `2` SITE_CONFIRMED_DOWN: Verified down by Verifliers +- `2` SITE_CONFIRMED_DOWN: Verified down by Verifliers, WPCOM notified -## Monitoring Behavior +## Monitoring Behaviour **Check Process:** -- Initial timeout: 10 seconds -- Verification timeout: 20 seconds (on retry from different locations) -- Max redirects: 3 (beyond this 
triggers "redirect" error) -- HTTP response code < 400 is considered success -- User Agent: `jetmon/1.0 (Jetpack Site Uptime Monitor by WordPress.com)` +- Default timeout: `NET_COMMS_TIMEOUT` seconds (configurable per site via `jetmon_site_check_config.timeout_seconds`) +- HTTP response code < 400 is success +- Redirect policy configurable per site: `follow` (default), `alert` (warn on chain change), `fail` +- Max redirects when following: 10 +- Keyword check: if `check_keyword` is set, GET the body and confirm the string is present +- User-Agent: `jetmon/2.0 (Jetpack Site Uptime Monitor by WordPress.com)` +- Per-site custom headers merged from `jetmon_site_check_config.custom_headers` + +**Timing Breakdown (via `net/http/httptrace`):** +Every check records composite RTT plus DNS lookup, TCP connect, TLS handshake, and first response byte (TTFB) timings. These samples are stored in `jetmon_check_history` for trending and API statistics. Scheduler-level StatsD metrics expose phase timing and write volume so capacity tests can separate check execution, freshness writes, check-history inserts, SSL expiry updates, and event handling. + +**SSL Monitoring:** +Every HTTPS check inspects `tls.ConnectionState` for: +- Certificate `NotAfter` — alerts at 30, 14, and 7 days before expiry +- TLS version — flags TLS 1.0/1.1 as deprecated +- Cipher suite — recorded in audit log **Downtime Verification:** -When a site appears down, Jetmon retries from the same location twice, then verifies from 2 other locations on different continents via Verifliers before confirming downtime. - -**Status Change Email Types:** -- `server`: 5xx response (internal/fatal error) -- `blocked`: 403 response (monitoring blocked) -- `client`: 4xx response other than 403 (auth/DNS issues) -- `https`: SSL certificate problems -- `intermittent`: Request timeout (>10 seconds but site may load) -- `redirect`: Too many redirects (>3) -- `success`: Normal response (used in "site is back up" emails) +1. 
Local check fails → open a `Seems Down` event (severity 3) and enter the local retry queue. The event opens on the **first** failure so `started_at` reflects the actual incident start. Subsequent failures during retry are no-ops on the events table (idempotent dedup). +2. After `NUM_OF_CHECKS` local failures → dispatch to Verifliers (event stays Seems Down) +3. `PEER_OFFLINE_LIMIT` Veriflier agreements required to confirm +4. Veriflier outcomes: + - **Confirms** → Promote event to `Down` (severity 4) with `reason = verifier_confirmed`. WPCOM notification via same payload as original. + - **Disagrees** → Close event with `resolution_reason = false_alarm`. +5. Recovery (any successful probe while an event is open): + - From `Seems Down` → close with `resolution_reason = probe_cleared`. + - From `Down` → close with `resolution_reason = verifier_cleared` and send recovery notification. + +Shadow-v2-state migration keeps incidents authoritative in `jetmon_events` + `jetmon_event_transitions` while `jetpack_monitor_sites` remains the v1-owned site identity/cadence/projection table. V2-only check config lives in `jetmon_site_check_config`. When `LEGACY_STATUS_PROJECTION_ENABLE` is true, the `jetpack_monitor_sites.site_status` / `last_status_change` projection is updated in the same transaction as every event mutation (no drift). v1 mapping: open Seems Down → `site_status = SITE_DOWN (0)`; promoted to Down → `site_status = SITE_CONFIRMED_DOWN (2)`; closed → `site_status = SITE_RUNNING (1)`. After legacy readers move to the v2 API/event tables, this projection can be disabled. + +**Alert Deduplication:** +After an alert fires, subsequent alerts for the same site are suppressed for the global `ALERT_COOLDOWN_MINUTES` value or `jetmon_site_check_config.alert_cooldown_minutes`. Suppression is recorded in the audit log. 
+ +**Status Change Types (unchanged):** +- `server`: 5xx response +- `blocked`: 403 response +- `client`: 4xx other than 403 +- `https`: SSL/TLS problems +- `intermittent`: Request timeout +- `redirect`: Redirect policy failure +- `success`: Site recovered ## Database Schema -Sites are stored in `jetpack_monitor_sites` with bucket-based sharding. The `bucket_no` field (0-511) enables horizontal scaling across multiple Jetmon instances. +Sites are stored in the v1-shaped `jetpack_monitor_sites` table with +bucket-based sharding. The `bucket_no` field enables horizontal scaling. Jetmon +v2 keeps v2-only site config and runtime state out of that legacy table: rich +probe config lives in `jetmon_site_check_config`, and freshness / SSL +observation state lives in `jetmon_site_runtime`. During rollout, v2 writes +only the v1 compatibility projection fields `site_status` and +`last_status_change` back to `jetpack_monitor_sites`. -## Metrics +New tables introduced by Jetmon 2: -StatsD metrics are sent with prefix `com.jetpack.jetmon.`. Key metrics include worker lifecycle events, queue sizes, database timing, and memory usage. +| Table | Purpose | +|-------|---------| +| `jetmon_hosts` | MySQL-coordinated bucket ownership and heartbeat | +| `jetmon_events` | Current state of every incident — one row per `(blog_id, endpoint_id, check_type, discriminator)` while open; mutable until `ended_at` is set, then frozen | +| `jetmon_event_transitions` | Append-only history of every mutation to `jetmon_events` (open, severity change, state change, cause link, close) | +| `jetmon_audit_log` | Operational trail — WPCOM notifications, retry dispatch, verifier RPCs, alert/maintenance suppression, config reloads. 
Site-state changes do **not** flow through here | +| `jetmon_check_history` | RTT and timing samples for trending | +| `jetmon_site_check_config` | V2-only per-site check policy/config: HEAD/GET mode, detection profile, keywords, maintenance windows, headers, timeout, redirect policy, cooldown | +| `jetmon_site_runtime` | V2-only runtime freshness and observation projection: last checked, next check, last alert, SSL expiry | +| `jetmon_false_positives` | Veriflier non-confirmation events | -**Grafana Dashboard:** Production metrics are visualized in the Jetmon Health Dashboard using Graphite as the StatsD backend. The dashboard tracks free/active workers, sites processed, round times, and memory usage. +## Multi-Host Bucket Coordination -**StatsD Configuration Notes:** -- Flush interval: 5 seconds (`STATS_UPDATE_INTERVAL_MS`) -- Graphite retention: 10s:6h, 1m:7d, 10m:5y -- Counter metrics use `sum` aggregation; gauges use `average` +Jetmon 2 normally replaces static `BUCKET_NO_MIN/MAX` config with runtime bucket ownership via the `jetmon_hosts` table. On startup, each instance claims unclaimed or expired bucket ranges using `SELECT ... FOR UPDATE` transactions. A heartbeat query runs each round; hosts with stale heartbeats (older than `BUCKET_HEARTBEAT_GRACE_SEC`) have their buckets absorbed by surviving peers. On SIGINT, the instance releases its buckets immediately. During the initial v1-to-v2 migration only, `PINNED_BUCKET_MIN/MAX` (or legacy `BUCKET_NO_MIN/MAX`) can pin one v2 host to its v1 predecessor's exact bucket range and disables `jetmon_hosts` ownership for that host. -## WPCOM Integration +This enables zero-config horizontal scaling (spin up a host, it claims buckets) and self-healing coverage (a failed host's buckets are absorbed within one grace period) without a cluster orchestrator. 
+ +## Metrics -**Jetmon Endpoint:** WPCOM receives status change notifications from Jetmon and triggers the `jetpack_monitor_site_status_change` hook for consumers (notifications, Activity Log, etc.). +StatsD metrics retain the same prefix and dotted path format as Jetmon 1: `com.jetpack.jetmon.`. New metrics added by Jetmon 2 follow the same naming convention and are additive. -**Email Notification Options (stored on WPCOM):** -- `jetpack_monitor_notifications_users_ids`: WPCOM user IDs to notify -- `jetpack_monitor_notify_email_addresses`: Additional email addresses +StatsD is the primary metrics transport. No Prometheus endpoint is provided. -**REST API Endpoints:** -- `GET /sites/{site}/jetpack-monitor-status`: Current monitoring status -- `GET /sites/{site}/jetpack-monitor-incidents`: Historical incidents -- `GET/POST /sites/{site}/jetpack-monitor-settings`: Monitor configuration +## WPCOM Integration + +Jetmon notifies WPCOM of status changes via the same JSON payload format as Jetmon 1. The `jetpack_monitor_site_status_change` hook on WPCOM is triggered for consumers (notifications, Activity Log, etc.). A circuit breaker protects against WPCOM API failures: after N consecutive failures the circuit opens, pending notifications are queued in memory, and retries are attempted on a backoff schedule. ## Production Deployment -Jetmon runs on 6 production hosts managed by the Systems team. To deploy changes: -1. Test changes locally using Docker environment -2. Create a Systems Request with PR links for review -3. Systems team deploys to production hosts +Jetmon runs on production hosts managed by the Systems team. To deploy changes: +1. Test locally using the Docker environment (`go test ./...`, manual Docker verification) +2. Create a PR and request a Systems Request with PR links +3. Systems team performs a rolling update: one host at a time, SIGINT → drain → deploy binary → restart +4. 
Surviving hosts absorb the draining host's buckets during each update window + +Rolling updates require no simultaneous restart of all hosts and leave no sites unchecked during the update. + +## Architectural Decisions — Event and State Model + +These decisions govern how Jetmon models site state. They must be maintained consistently across all changes. Full design rationale is in [`docs/taxonomy.md`](docs/taxonomy.md) (Parts 2–3) and [`docs/events.md`](docs/events.md). -## Worker Lifecycle +**Events are the source of truth.** Site status is event-sourced across two tables: `jetmon_events` (one row per incident, holding the current severity/state/metadata) and `jetmon_event_transitions` (append-only history of every mutation). The site row stores a denormalized projection for read performance. Update events, transitions, and the projection in the same transaction — they must not drift. If the projection is ever suspect, rebuild it from the events tables. + +**Every event mutation writes a transition row in the same transaction.** Open, severity bump, state change, cause-link change, close — no carve-outs. The `eventstore` package is the only writer for `jetmon_events` and `jetmon_event_transitions`; external callers must go through it. This keeps the invariant testable with one integration test surface. + +**Severity and state are separate fields.** Severity is numeric — use it for ordering, thresholds, and rollup. State is a human-readable label — use it for display and lifecycle transitions. A live event's severity can be updated in place without changing its state (a worsening degradation is not a new kind of problem). + +**"Seems Down" is a first-class lifecycle state.** Between first probe failure and verifier confirmation, a site is Seems Down. It is not an implementation detail — dashboards show it, alert rules can key off it. 
The lifecycle is: +``` +Up → Seems Down → Down → Resolved + ↓ + Up (false alarm) +``` -Workers exit and are respawned when: -- Memory exceeds `WORKER_MAX_MEM_MB` (53MB default) -- Check count exceeds `WORKER_MAX_CHECKS` (10,000 default) -- Process receives termination signal +**Events update in place on severity change.** When a Seems Down event is verifier-confirmed to Down, update the same event row — do not close and open a new one. The event's `started_at` stays at first-failure time. Incident duration is honest: it starts from first failure, not from confirmation. -The master process tracks worker states and gracefully handles recycling. +**Event identity is idempotent.** The same underlying failure must not produce duplicate events. Deduplication lives in the shared probe runner, not in individual check types. Key events by `(blog_id, endpoint_id, check_type, [discriminator])` so repeated detection of the same condition updates the existing open event. + +**Resolution reason is required on close.** When an event closes, record why: `verifier_cleared`, `false_alarm`, `manual_override`, `auto_timeout`. Don't just set `ended_at` — capture the cause. This affects uptime calculations and report accuracy. + +**Causal links are separate from hierarchical rollup.** An endpoint event rolling up to site level is a hierarchy relationship. A Layer-3 event caused by a Layer-1 failure is a causal relationship. Store these in separate structures. Conflating them creates bugs where dismissing a cause accidentally dismisses a rollup. + +**Unknown is not downtime.** If the probe crashes, a region loses network, or the Jetpack agent stops reporting, the result is Unknown — not Down. Monitor-side failures must never be reported as customer-site downtime. ## Known Pitfalls -**Retry Queue Persistence:** Retry queues must persist between rounds. 
Flushing queues at round start prevents sites from being confirmed as down, since the 1-minute recheck cannot complete before the next round. +**Retry Queue Persistence:** The local retry queue must persist between rounds. Do not flush it at round start — a site must accumulate `NUM_OF_CHECKS` failures before Veriflier escalation, and flushing resets that counter, preventing downtime confirmation. + +**Bucket Claiming Races:** When dynamic ownership is active, the `SELECT ... FOR UPDATE` transaction on `jetmon_hosts` is the only safe way to claim buckets. Do not claim buckets outside a transaction — two hosts starting simultaneously will both see the same unclaimed range and must not both write it. Pinned v1-to-v2 migration hosts intentionally do not claim buckets in `jetmon_hosts`. + +**Circuit Breaker Floor:** The WPCOM API circuit breaker queue is bounded. If the queue fills, the oldest pending notifications are dropped with an error log. Monitor the circuit breaker state in the operator dashboard during any WPCOM API incident. + +**Veriflier Quorum Floor:** When Verifliers are marked unhealthy and excluded, `PEER_OFFLINE_LIMIT` adjusts dynamically, but there is a configured floor to prevent a single healthy Veriflier from confirming downtime alone. Ensure the floor is set appropriately for the number of deployed Verifliers. + +**Delivery Ownership During Rollout:** Webhook and alert-contact workers claim delivery rows transactionally. Use `DELIVERY_OWNER_HOST` when you want to keep only one delivery owner active per database cluster during migration from embedded `jetmon2` delivery to standalone `jetmon-deliverer`. + +**Maintenance Windows:** Checks continue during a maintenance window and data is recorded in the audit log, but no alerts fire. Verify that `maintenance_end` is correctly set — an open-ended maintenance window silently suppresses all alerts for that site indefinitely. 
+ + +**Memory Pressure Drain:** If RSS exceeds the configured threshold, the goroutine pool shrinks by 10% via graceful drain. This reduces throughput temporarily. If memory pressure is sustained, investigate for goroutine leaks using the pprof endpoint at `http://localhost:6060/debug/pprof/` (or the configured `DEBUG_PORT`; localhost only) before increasing `WORKER_MAX_MEM_MB`. -**Bucket Configuration:** The `BUCKET_NO_MIN/MAX` configuration must not overlap between hosts. A past misconfiguration caused hosts to process only half their intended sites, masking performance issues. +## Agent Workflow Notes -**Node Version Sensitivity:** RTT (round-trip time) calculations can vary between Node.js versions. Version changes should be tested thoroughly as they can affect timeout behaviors. +These notes are for Codex and other coding agents working for Chris. -**Memory Pressure:** When checking more sites (due to shorter intervals or configuration fixes), memory usage increases. Monitor memory metrics and consider scaling hosts horizontally if workers frequently hit memory limits. +- If uptime-bench or Jetmon capacity tests are running, do not change deployed +  services, support hosts, databases, provider state, fleet config, or runtime +  config without explicit permission. +- When a request could touch both `jetmon` and `uptime-bench`, state the repo +  path before acting. Treat "this repo" as ambiguous when multiple agents or +  worktrees are active. +- Prefer local analysis, agent files, branch inspection, code review, and +  handoff preparation while tests are active. +- Project-local agent playbooks live under `.agents/skills`. +- For uptime-bench-specific report or fleet rules, also read +  `/home/gaarai/code/uptime-bench/AGENTS.md`. 
diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..3f60c7a1 --- /dev/null +++ b/Makefile @@ -0,0 +1,155 @@ +BINARY := bin/jetmon2 +DELIVERER := bin/jetmon-deliverer +VERIFLIER := bin/veriflier2 +API_SMOKE_BATCH ?= local-smoke +API_SMOKE_ARGS ?= +API_VALIDATE_BATCH ?= api-cli-validate +API_VALIDATE_COUNT ?= 1 +API_VALIDATE_MODE ?= http-500 +API_VALIDATE_WAIT ?= 30s +API_VALIDATE_WEBHOOK_WAIT ?= 60s +API_VALIDATE_SKIP_WEBHOOK ?= 0 +API_VALIDATE_SKIP_FAILURE ?= 0 +DOCKER_COMPOSE ?= docker compose -f docker/docker-compose.yml +API_CLI_TOKEN_CONSUMER ?= api-cli +API_CLI_TOKEN_SCOPE ?= admin +API_CLI_TOKEN_CREATED_BY ?= docker-local +API_CLI_TOKEN_TTL ?= 0 +API_CLI_TOKEN_ID ?= +ROLLOUT_VM_LAB_HOST ?= jetmon-vm-host-1 +ROLLOUT_VM_LAB_SSH ?= ssh -F $(HOME)/.ssh/config -o ControlMaster=no -o ControlPath=none -o BatchMode=yes -o ConnectTimeout=10 +ROLLOUT_VM_LAB_SNAPSHOT ?= pre-guided-flow +GO ?= $(shell if command -v go >/dev/null 2>&1; then command -v go; elif [ -x /usr/local/go/bin/go ]; then printf /usr/local/go/bin/go; else printf go; fi) +GOCACHE ?= /tmp/jetmon-go-cache +GOMODCACHE ?= /tmp/jetmon-gomod-cache +GO_ENV := GOCACHE=$(GOCACHE) GOMODCACHE=$(GOMODCACHE) +BUILD_FLAGS := -ldflags "-X main.version=$(shell git describe --tags --always --dirty) \ + -X main.buildDate=$(shell date -u +%Y-%m-%dT%H:%M:%SZ) \ + -X main.goVersion=$(shell $(GO) version | awk '{print $$3}')" + +.PHONY: all build build-deliverer build-veriflier generate test test-race test-veriflier-soak lint vet rollout-docs-verify rollout-rehearsal-verify rollout-vm-lab-sync rollout-vm-lab-sync-artifacts rollout-vm-lab-stage-v2 rollout-vm-lab-doctor rollout-vm-lab-prepare rollout-vm-lab-smoke rollout-vm-lab-execute-smoke rollout-vm-lab-failure-smoke rollout-vm-lab-resume-smoke rollout-vm-lab-post-start-rollback-smoke rollout-vm-lab-bad-ssh-smoke rollout-vm-lab-v2-start-failure-smoke rollout-vm-lab-runtime-guard-smoke rollout-vm-lab-real-activity-smoke 
rollout-vm-lab-snapshot-execute-smoke rollout-vm-lab-snapshot-all-smoke api-cli-smoke api-cli-validate api-cli-token-create api-cli-token-list api-cli-token-revoke clean + +all: build build-deliverer build-veriflier + +build: + mkdir -p bin + $(GO_ENV) CGO_ENABLED=0 $(GO) build $(BUILD_FLAGS) -o $(BINARY) ./cmd/jetmon2/ + +build-deliverer: + mkdir -p bin + $(GO_ENV) CGO_ENABLED=0 $(GO) build $(BUILD_FLAGS) -o $(DELIVERER) ./cmd/jetmon-deliverer/ + +build-veriflier: + mkdir -p bin + $(GO_ENV) CGO_ENABLED=0 $(GO) build $(BUILD_FLAGS) -o $(VERIFLIER) ./veriflier2/cmd/ + + +generate: + protoc --go_out=. --go_opt=paths=source_relative \ + --go-grpc_out=. --go-grpc_opt=paths=source_relative \ + proto/veriflier.proto + +test: + $(GO_ENV) $(GO) test ./... + +test-race: + $(GO_ENV) $(GO) test -race ./... + +test-veriflier-soak: + $(GO_ENV) $(GO) test ./internal/veriflier ./cmd/jetmon2 -run 'Test(V2Soak|VeriflierDiscoverySoak)' + +lint: + $(GO_ENV) $(GO) vet ./... + +vet: lint + +rollout-docs-verify: all test lint + scripts/rollout-docs-verify.sh + +rollout-rehearsal-verify: build + scripts/rollout-rehearsal-verify.sh + +rollout-vm-lab-sync: + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'mkdir -p ~/jetmon-rollout-tools/scripts ~/jetmon-rollout-tools/docs' + rsync -e "$(ROLLOUT_VM_LAB_SSH)" -a scripts/rollout-vm-lab.sh $(ROLLOUT_VM_LAB_HOST):~/jetmon-rollout-tools/scripts/ + rsync -e "$(ROLLOUT_VM_LAB_SSH)" -a docs/rollout-vm-lab.md $(ROLLOUT_VM_LAB_HOST):~/jetmon-rollout-tools/docs/ + +rollout-vm-lab-sync-artifacts: build rollout-vm-lab-sync + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'mkdir -p ~/jetmon-rollout-tools/bin ~/jetmon-rollout-tools/systemd ~/jetmon-rollout-tools/config' + rsync -e "$(ROLLOUT_VM_LAB_SSH)" -a bin/jetmon2 $(ROLLOUT_VM_LAB_HOST):~/jetmon-rollout-tools/bin/ + rsync -e "$(ROLLOUT_VM_LAB_SSH)" -a systemd/jetmon2.service systemd/jetmon2-logrotate $(ROLLOUT_VM_LAB_HOST):~/jetmon-rollout-tools/systemd/ + rsync -e "$(ROLLOUT_VM_LAB_SSH)" -a 
config/config-sample.json config/db-config-sample.conf $(ROLLOUT_VM_LAB_HOST):~/jetmon-rollout-tools/config/ + +rollout-vm-lab-stage-v2: rollout-vm-lab-sync-artifacts + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh install-v2' + +rollout-vm-lab-doctor: rollout-vm-lab-sync + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh doctor' + +rollout-vm-lab-prepare: rollout-vm-lab-sync-artifacts + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh prepare-topology' + +rollout-vm-lab-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-preflight' + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-guided-dry-run' + +rollout-vm-lab-execute-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-guided-execute-rollback' + +rollout-vm-lab-failure-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-failure-gates' + +rollout-vm-lab-resume-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-interrupted-resume' + +rollout-vm-lab-post-start-rollback-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-post-start-rollback' + +rollout-vm-lab-bad-ssh-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-bad-ssh' + +rollout-vm-lab-v2-start-failure-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && 
scripts/rollout-vm-lab.sh smoke-v2-start-failure' + +rollout-vm-lab-runtime-guard-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-runtime-guards' + +rollout-vm-lab-real-activity-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh smoke-real-activity' + +rollout-vm-lab-snapshot-execute-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh snapshot-run $(ROLLOUT_VM_LAB_SNAPSHOT) execute-rollback' + +rollout-vm-lab-snapshot-all-smoke: rollout-vm-lab-stage-v2 + $(ROLLOUT_VM_LAB_SSH) $(ROLLOUT_VM_LAB_HOST) 'cd ~/jetmon-rollout-tools && scripts/rollout-vm-lab.sh snapshot-run-all $(ROLLOUT_VM_LAB_SNAPSHOT)' + +api-cli-smoke: build + @test -n "$$JETMON_API_TOKEN" || { echo "JETMON_API_TOKEN is required"; exit 1; } + $(BINARY) api health --pretty + $(BINARY) api me --pretty + $(BINARY) api sites bulk-add --count 3 --batch $(API_SMOKE_BATCH) --dry-run --pretty + $(BINARY) api smoke --batch $(API_SMOKE_BATCH) --pretty $(API_SMOKE_ARGS) + +api-cli-validate: build + API_CLI_BINARY=$(BINARY) \ + API_VALIDATE_BATCH=$(API_VALIDATE_BATCH) \ + API_VALIDATE_COUNT=$(API_VALIDATE_COUNT) \ + API_VALIDATE_MODE=$(API_VALIDATE_MODE) \ + API_VALIDATE_WAIT=$(API_VALIDATE_WAIT) \ + API_VALIDATE_WEBHOOK_WAIT=$(API_VALIDATE_WEBHOOK_WAIT) \ + API_VALIDATE_SKIP_WEBHOOK=$(API_VALIDATE_SKIP_WEBHOOK) \ + API_VALIDATE_SKIP_FAILURE=$(API_VALIDATE_SKIP_FAILURE) \ + scripts/api-cli-validate.sh + +api-cli-token-create: + $(DOCKER_COMPOSE) exec jetmon ./jetmon2 keys create --consumer $(API_CLI_TOKEN_CONSUMER) --scope $(API_CLI_TOKEN_SCOPE) --ttl $(API_CLI_TOKEN_TTL) --created-by $(API_CLI_TOKEN_CREATED_BY) + +api-cli-token-list: + $(DOCKER_COMPOSE) exec jetmon ./jetmon2 keys list + +api-cli-token-revoke: + @test -n "$(API_CLI_TOKEN_ID)" || { echo 
"API_CLI_TOKEN_ID is required"; exit 1; } + $(DOCKER_COMPOSE) exec jetmon ./jetmon2 keys revoke $(API_CLI_TOKEN_ID) + +clean: + rm -f $(BINARY) $(DELIVERER) $(VERIFLIER) diff --git a/README.md b/README.md index b6c90975..c99caed9 100644 --- a/README.md +++ b/README.md @@ -1,97 +1,177 @@ -jetmon.js -========= - -Overview --------- - -Parallel HTTP health monitoring using HEAD requests for large scale website monitoring. - -The service relies on confirmation from external servers to verify that sites are indeed offline. This mitigates the Internet weather issue sometimes giving false positives. The code for these servers can be found in the verifliers directory. - -Architecture --------- -![jetmon_chart](https://user-images.githubusercontent.com/1758399/201877599-8992b68a-9ca7-4984-9de7-abe99f989d88.png) - -Jetmon will periodically (every 5 minutes) loop over a list of Jetpack sites and perform a HEAD request to check their current status. - -When a status change is detected, Jetmon will notify WPCOM including the related notification data in the request. - -Here are the possible flows, depending on the status change: - -| Previous Status | Current status | Action | -| ---------------- | ---------------- | ---------------------------------------------------------------------------------- | -| DOWN | UP | Notify WPCOM about status change | -| UP | DOWN | Verify status down via the Veriflier services and notify WPCOM about status change | -| DOWN | DOWN (confirmed) | Notify WPCOM about status change | - -### Jetmon service - -The Jetmon master service is responsible for communicating with the database in order to fetch a list of sites to check. It will spawn and re-allocate workers every five seconds and update stats repeatedly based on `STATS_UPDATE_INTERVAL_MS`. - -The jetmon-workers internally use an Node Addon written in C++ to check the connection by sending a HEAD request to the server. 
- - -### Verifliers - -The Veriflier service, which is written in C++ and uses the QT Framework, does something similar to the Node Addon mentioned before, but lives in its own server. Note that the production environment consists of multiple Verifliers, though the local development environment consists of a single Veriflier service. - -### Notification data - -Here are the current notification data, Jetmon sends to WPCOM upon detecting a site status change: -- `blog_id`: The site's WPCOM ID -- `monitor_url`: The URL Jetmon checked -- `status_id`: The site's current status. Enum: `0` is status down, `1` is status running and `2` status confirmed down. -- `last_check`: The datetime of the last check -- `last_status_change`: The datetime of the last status change -- `checks`: An array of the checks results from both Jetmon and Veriflier services. Each entry consists of: - - `type`: Enum: `1` refers to a Jetmon check, while `2` to a Veriflier check. - - `host`: The server hostname. - - `status`: The site's current status. Enum: `0` is status down, `1` is status running and `2` status confirmed down. - - `rtt`: Round-trip time (RTT) in milliseconds (ms). - - `code`: The HTTP response status code. - - -Installation ------------- - -1) Make sure you have installed [Docker](https://docs.docker.com/get-docker/) and [docker-compose](https://docs.docker.com/compose/install/) - -2) Clone the Jetmon monorepo - -3) Copy the environment variables file from within the `docker` folder: `cp jetmon/docker/.env-sample jetmon/docker/.env` - -4) Open `jetmon/docker/.env` and make any modifications you'd like. - -5) Run `docker compose build` from within the `docker` folder - -Configuration -------------- - -The Jetmon configuration lives under `config/config.json`. This file is generated on the fly, if not present, each time you run the Jetmon service, using the `config-sample.json` and the corresponding environment variables defined in `docker/.env`. 
-Feel free to modify your local config file as needed. - -The Veriflier configuration lives under `veriflier/config/veriflier.json`. This file is generated on the fly, if not present, each time you run the Veriflier service, using the `veriflier-sample.json` and the corresponding environment variables defined in `docker/.env`. - -Running -------- - -Run `docker compose up -d` from within the `docker` folder. - -Database -------- - -Main Table Schema: - - CREATE TABLE `jetpack_monitor_sites` ( - `jetpack_monitor_site_id` bigint(20) unsigned NOT NULL AUTO_INCREMENT PRIMARY KEY, - `blog_id` bigint(20) unsigned NOT NULL, - `bucket_no` smallint(2) unsigned NOT NULL, - `monitor_url` varchar(300) NOT NULL, - `monitor_active` tinyint(1) unsigned NOT NULL DEFAULT 1, - `site_status` tinyint(1) unsigned NOT NULL DEFAULT 1, - `last_status_change` timestamp NULL DEFAULT current_timestamp(), - `check_interval` tinyint(1) unsigned NOT NULL DEFAULT 5, - INDEX `blog_id_monitor_url` (`blog_id`, `monitor_url`), - INDEX `bucket_no_monitor_active_check_interval` (`bucket_no`, `monitor_active`, `check_interval`) - ); +# Jetmon 2 +Jetmon 2 is the Go rewrite of Jetpack's uptime monitor: the same production +contract v1 consumers depend on, with a cleaner runtime, an event-sourced health +model, richer diagnostics, and API-first automation. + +The core detection story stays familiar: + +```text +local checks -> local retries -> geo Veriflier confirmation -> notify +``` + +The first difference is correctness: v2 checks sites with `GET`, not the +`HEAD`-only probes that made v1 disagree with real visitor behavior on too many +VIP and Agency sites. Around that more realistic probe, Jetmon 2 records what +it saw, why it believed a site was down, which Verifliers agreed, which +notifications were sent, and how every incident changed over time. It turns "up +or down" into an auditable health platform. 
+ +## Why This Matters + +| Audience | What Gets Better | +|---|---| +| Systems | Static Go binaries, no `npm`, `node-gyp`, Qt, or worker process tree. Bucket ownership is coordinated in MySQL, hosts drain cleanly, and memory pressure is handled inside the goroutine pool. | +| VIP and Agency | GET-based checks that match customer-visible behavior better than v1's HEAD probes, plus fewer noisy pages and fewer missed incidents through local retries, Veriflier quorum, maintenance windows, keyword checks, redirect policy, SSL/TLS checks, and clearer failure classes. | +| Leadership | A foundation for differentiated uptime monitoring: internal API, webhooks, managed alert contacts, tenant-aware gateway paths, and future Jetpack/WPCOM integrations. | +| Happiness Engineers | Incident answers with evidence: audit logs, event transitions, check timing, Veriflier votes, WPCOM payloads, and suppression reasons are all queryable. | +| Jetpack | A monitor that can grow into a product surface, not just a backend notification hook. 
| + +## What Changed + +| Area | Jetmon 1 | Jetmon 2 | +|---|---|---| +| Runtime | Node master, Node workers, C++ native addon, Qt Veriflier | Go monitor, Go Veriflier, optional Go deliverer | +| Probe method | `HEAD` requests that could disagree with real page loads | `GET` requests for local checks and Veriflier checks | +| State | Mutable `site_status` projection | `jetmon_events` plus append-only `jetmon_event_transitions` | +| Detection | Binary status changes | `Seems Down`, `Down`, recovery, false-alarm, and severity transitions | +| Evidence | Basic logs | Audit log, check history, timing breakdown, verifier outcomes, API request logs | +| Integrations | WPCOM notification path | WPCOM, REST API, HMAC webhooks, email, PagerDuty, Slack, Teams | +| Operations | Static bucket config and process recycling | Dynamic bucket ownership, graceful drain, hot reload, dashboard, pprof | + +Jetmon 2 keeps the compatibility surfaces that matter during rollout: + +- MySQL changes are additive. +- WPCOM notification payloads stay compatible. +- StatsD metric naming remains `com.jetpack.jetmon.`. +- Legacy log and stats file paths remain available. +- `jetpack_monitor_sites.site_status` can be projected from v2 events during + the [v1-to-v2 migration](docs/v1-to-v2-migration.md). + +## How Incidents Flow + +1. The monitor checks active sites with a bounded Go worker pool. +2. A first local failure opens a `Seems Down` event so the incident start time is + honest. +3. Local retries absorb one-off network blips before customer notification. +4. Geo-distributed Verifliers confirm or reject the outage. +5. Confirmed outages become `Down`; rejected outages close as false alarms. +6. WPCOM, webhooks, alert contacts, the dashboard, and the API all read from the + same event and transition history. + +That model gives operators and support teams the part v1 could not: a coherent +timeline for every incident, not just the final status bit. 
+ +## Try It Locally + +Docker Compose is the fastest path for local development: + +```bash +cd docker +cp .env-sample .env +docker compose up --build -d +``` + +Build and test from the repository root: + +```bash +make all +make test +make test-race +``` + +The API CLI can exercise the internal REST API and local failure fixture: + +```bash +make build +make api-cli-token-create + +export JETMON_API_URL=http://localhost:${API_HOST_PORT:-8090} +export JETMON_API_TOKEN=jm_replace_with_the_printed_token + +./bin/jetmon2 api health --pretty +./bin/jetmon2 api commands --output table +make api-cli-smoke +``` + +See [docs/getting-started.md](docs/getting-started.md) for the full local loop. + +## Documentation + +| Document | Start Here For | +|---|---| +| [docs/project.md](docs/project.md) | Full product and implementation specification | +| [docs/internal-api-reference.md](docs/internal-api-reference.md) | Internal REST API reference | +| [docs/events.md](docs/events.md) | Event lifecycle and transition semantics | +| [docs/taxonomy.md](docs/taxonomy.md) | Severity, state, cause, and rollup taxonomy | +| [docs/getting-started.md](docs/getting-started.md) | Docker setup, builds, tests, API CLI smoke runs | +| [docs/docker-images.md](docs/docker-images.md) | Pulling and running the published GHCR images | +| [docs/operations-guide.md](docs/operations-guide.md) | Production config, rollout, delivery workers, metrics, debugging | +| [docs/data-model.md](docs/data-model.md) | Tables, migrations, event projection, tenant mapping | +| [docs/support-guide.md](docs/support-guide.md) | HE workflows for explaining alerts and missed alerts | +| [docs/api-cli-guide.md](docs/api-cli-guide.md) | API CLI examples and automation patterns | +| [docs/v1-to-v2-migration.md](docs/v1-to-v2-migration.md) | Full v1-to-v2 production migration and rollback runbook | +| [docs/jetmon-deliverer-rollout.md](docs/jetmon-deliverer-rollout.md) | Moving outbound delivery to `jetmon-deliverer` | +| 
[docs/roadmap.md](docs/roadmap.md) | Broader v2 and v3 planning | + +Longer design decisions live in [docs/adr/](docs/adr/). + +## Production Posture + +Jetmon 2 is designed for a cautious host-by-host rollout. The complete process +is in [docs/v1-to-v2-migration.md](docs/v1-to-v2-migration.md). Use +[docs/rollout-quick-reference.md](docs/rollout-quick-reference.md) as the +one-page command checklist during rehearsals and rollout windows: + +- Run `./jetmon2 migrate` before first start. Migrations are embedded and + additive. +- Run `./jetmon2 validate-config` before deploy to check config shape, + database connectivity, email transport mode, verifier config, and rollout + safety commands. +- Use pinned bucket mode for the first v1-to-v2 migration so one v1 host can be + replaced by one v2 host with the same bucket range. +- Prefer `rollout guided` during production rollout windows so operators get a + transcript, resume state, typed confirmations, and fail-closed rollout gates. + Run it from the staged v2 runtime host. For fresh-server takeovers, that + runtime host must have SSH access to the old v1 host when the configured v1 + stop/start commands use SSH. + Use `rollout static-plan-check`, `rollout host-preflight`, + `rollout cutover-check`, `rollout rollback-check`, and targeted + `rollout activity-check` / `rollout projection-drift` from the migration + runbook before changing the next host. Use `rollout state-report` for a + quick handoff snapshot. +- Keep `LEGACY_STATUS_PROJECTION_ENABLE` on until legacy readers have moved to + the v2 API or event tables. +- Use `SIGINT` or `./jetmon2 drain` for graceful shutdown. +- Use `SIGHUP` or `./jetmon2 reload` for config reload without restart. +- Use the host dashboard at `/` and the fleet dashboard at `/fleet` during + rollout windows. Keep `DASHBOARD_BIND_ADDR` on loopback unless the listener is + protected by trusted operator-network controls. 
+ +After the fleet is fully on v2, dynamic bucket ownership lets surviving hosts +absorb work during rolling updates. + +## Main Binaries + +| Binary | Purpose | +|---|---| +| `bin/jetmon2` | Monitor, orchestrator, REST API, dashboard, embedded delivery workers | +| `bin/veriflier2` | Remote confirmation worker used by the monitor | +| `bin/jetmon-deliverer` | Standalone webhook and alert-contact delivery worker | + +## Development Commands + +```bash +make all # Build jetmon2, jetmon-deliverer, and veriflier2 +make build # Build only jetmon2 +make build-deliverer # Build only jetmon-deliverer +make build-veriflier # Build only veriflier2 +make test # Run the Go test suite +make test-race # Run tests with the race detector +make lint # Run lint checks +make rollout-docs-verify # Verify rollout docs/tooling alignment +``` + +`make generate` is intentionally separate. It requires `protoc` and Go protobuf +plugins, and the generated stubs are not part of the production JSON-over-HTTP +Veriflier transport. 
diff --git a/binding.gyp b/binding.gyp deleted file mode 100644 index 7e0e2186..00000000 --- a/binding.gyp +++ /dev/null @@ -1,17 +0,0 @@ -{ - 'targets':[ { - 'target_name':'jetmon', - 'cflags_cc': [ '-fexceptions','-O3' ], - 'sources':[ - './src/main.cpp', - './src/http_checker.cpp', - ], - 'conditions': [ - ['node_shared_openssl=="false"', { - 'include_dirs': [ - '<(node_root_dir)/deps/openssl/openssl/include' - ], - }] - ] - } ] -} diff --git a/cmd/jetmon-deliverer/delivery_check.go b/cmd/jetmon-deliverer/delivery_check.go new file mode 100644 index 00000000..4fdbe0a0 --- /dev/null +++ b/cmd/jetmon-deliverer/delivery_check.go @@ -0,0 +1,416 @@ +package main + +import ( + "context" + "database/sql" + "encoding/json" + "errors" + "flag" + "fmt" + "io" + "os" + "strings" + "text/tabwriter" + "time" + + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/db" +) + +const deliveryCheckDefaultSince = "15m" + +type deliveryCheckOptions struct { + HostOverride string + Since string + Output string + MaxPending int64 + MaxDue int64 + MaxAbandoned int64 + MaxFailed int64 + RequireRecentDelivery bool + RequireRecentWebhookDelivery bool + RequireRecentAlertDelivery bool +} + +type deliveryTableSummary struct { + Kind string `json:"kind"` + Pending int64 `json:"pending"` + DueNow int64 `json:"due_now"` + FutureRetry int64 `json:"future_retry"` + DeliveredSince int64 `json:"delivered_since"` + AbandonedSince int64 `json:"abandoned_since"` + FailedSince int64 `json:"failed_since"` + OldestPendingAgeSec int64 `json:"oldest_pending_age_sec"` + OldestDueAgeSec int64 `json:"oldest_due_age_sec"` +} + +type deliveryCheckReport struct { + OK bool `json:"ok"` + Host string `json:"host"` + GeneratedAt time.Time `json:"generated_at"` + Since time.Time `json:"since"` + OwnerLevel string `json:"owner_level,omitempty"` + OwnerMessage string `json:"owner_message,omitempty"` + Tables []deliveryTableSummary `json:"tables"` + Total deliveryTableSummary 
`json:"total"` + Failures []string `json:"failures,omitempty"` +} + +func parseDeliveryCheckOptions(args []string) (deliveryCheckOptions, error) { + opts := deliveryCheckOptions{ + Since: deliveryCheckDefaultSince, + Output: "text", + MaxPending: -1, + MaxDue: -1, + MaxAbandoned: -1, + MaxFailed: -1, + } + fs := flag.NewFlagSet("delivery-check", flag.ContinueOnError) + fs.SetOutput(io.Discard) + fs.StringVar(&opts.HostOverride, "host", "", "host id to use for DELIVERY_OWNER_HOST context (default current hostname)") + fs.StringVar(&opts.Since, "since", deliveryCheckDefaultSince, "report cutoff as duration like 15m or RFC3339 timestamp") + fs.StringVar(&opts.Output, "output", "text", "output format: text or json") + fs.Int64Var(&opts.MaxPending, "max-pending", -1, "fail when total pending deliveries exceed this count (-1 disables)") + fs.Int64Var(&opts.MaxDue, "max-due", -1, "fail when total due deliveries exceed this count (-1 disables)") + fs.Int64Var(&opts.MaxAbandoned, "max-abandoned", -1, "fail when abandoned deliveries since cutoff exceed this count (-1 disables)") + fs.Int64Var(&opts.MaxFailed, "max-failed", -1, "fail when failed deliveries since cutoff exceed this count (-1 disables)") + fs.BoolVar(&opts.RequireRecentDelivery, "require-recent-delivery", false, "fail unless at least one delivery succeeded since cutoff") + fs.BoolVar(&opts.RequireRecentWebhookDelivery, "require-recent-webhook-delivery", false, "fail unless at least one webhook delivery succeeded since cutoff") + fs.BoolVar(&opts.RequireRecentAlertDelivery, "require-recent-alert-delivery", false, "fail unless at least one alert-contact delivery succeeded since cutoff") + if err := fs.Parse(args); err != nil { + return opts, err + } + if fs.NArg() != 0 { + return opts, fmt.Errorf("unexpected argument %q", fs.Arg(0)) + } + opts.Output = strings.ToLower(strings.TrimSpace(opts.Output)) + if opts.Output != "text" && opts.Output != "json" { + return opts, fmt.Errorf("--output must be text or json") + 
} + if opts.MaxPending < -1 { + return opts, fmt.Errorf("--max-pending must be >= 0, or -1 to disable") + } + if opts.MaxDue < -1 { + return opts, fmt.Errorf("--max-due must be >= 0, or -1 to disable") + } + if opts.MaxAbandoned < -1 { + return opts, fmt.Errorf("--max-abandoned must be >= 0, or -1 to disable") + } + if opts.MaxFailed < -1 { + return opts, fmt.Errorf("--max-failed must be >= 0, or -1 to disable") + } + return opts, nil +} + +func cmdDeliveryCheck(args []string) { + opts, err := parseDeliveryCheckOptions(args) + if err != nil { + fmt.Fprintln(os.Stderr, "usage: jetmon-deliverer delivery-check [--host=] [--since=15m] [--max-pending=N] [--max-due=N] [--max-abandoned=N] [--max-failed=N] [--require-recent-delivery] [--require-recent-webhook-delivery] [--require-recent-alert-delivery] [--output=text|json]") + fmt.Fprintf(os.Stderr, "FAIL %v\n", err) + os.Exit(2) + } + emitProgress := opts.Output != "json" + + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := config.Load(configPath); err != nil { + fmt.Fprintf(os.Stderr, "FAIL config parse: %v\n", err) + os.Exit(1) + } + if emitProgress { + fmt.Println("PASS config parse") + } + + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + fmt.Fprintf(os.Stderr, "FAIL db connect: %v\n", err) + os.Exit(1) + } + if emitProgress { + fmt.Println("PASS db connect") + } + + hostID := strings.TrimSpace(opts.HostOverride) + if hostID == "" { + hostID = db.Hostname() + } + report, err := buildDeliveryCheckReport(context.Background(), db.DB(), config.Get(), hostID, opts, time.Now().UTC()) + if err != nil { + fmt.Fprintf(os.Stderr, "FAIL delivery check: %v\n", err) + os.Exit(1) + } + if err := renderDeliveryCheckReport(os.Stdout, report, opts.Output); err != nil { + fmt.Fprintf(os.Stderr, "FAIL render delivery check: %v\n", err) + os.Exit(1) + } + if !report.OK { + os.Exit(1) + } +} + +func buildDeliveryCheckReport(ctx context.Context, conn *sql.DB, cfg *config.Config, hostID 
string, opts deliveryCheckOptions, now time.Time) (deliveryCheckReport, error) { + if conn == nil { + return deliveryCheckReport{}, errors.New("database handle is nil") + } + now = now.UTC() + cutoff, err := resolveDeliveryCheckCutoff(now, opts.Since) + if err != nil { + return deliveryCheckReport{}, err + } + hostID = strings.TrimSpace(hostID) + + report := deliveryCheckReport{ + Host: hostID, + GeneratedAt: now, + Since: cutoff, + Total: deliveryTableSummary{Kind: "total"}, + } + if cfg != nil { + report.OwnerLevel, report.OwnerMessage = deliveryOwnerStatus(cfg, hostID) + } + + tables := []struct { + kind string + name string + }{ + {kind: "webhook", name: "jetmon_webhook_deliveries"}, + {kind: "alert", name: "jetmon_alert_deliveries"}, + } + for _, table := range tables { + summary, err := queryDeliveryTableSummary(ctx, conn, table.kind, table.name, now, cutoff) + if err != nil { + return deliveryCheckReport{}, err + } + report.Tables = append(report.Tables, summary) + report.Total.Pending += summary.Pending + report.Total.DueNow += summary.DueNow + report.Total.FutureRetry += summary.FutureRetry + report.Total.DeliveredSince += summary.DeliveredSince + report.Total.AbandonedSince += summary.AbandonedSince + report.Total.FailedSince += summary.FailedSince + report.Total.OldestPendingAgeSec = maxInt64(report.Total.OldestPendingAgeSec, summary.OldestPendingAgeSec) + report.Total.OldestDueAgeSec = maxInt64(report.Total.OldestDueAgeSec, summary.OldestDueAgeSec) + } + + report.Failures = evaluateDeliveryCheckFailures(report, opts) + report.OK = len(report.Failures) == 0 + return report, nil +} + +func resolveDeliveryCheckCutoff(now time.Time, raw string) (time.Time, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return time.Time{}, errors.New("--since must not be empty") + } + if d, err := time.ParseDuration(raw); err == nil { + if d <= 0 { + return time.Time{}, errors.New("--since duration must be > 0") + } + return now.Add(-d).UTC(), nil + } + cutoff, 
err := time.Parse(time.RFC3339, raw) + if err != nil { + return time.Time{}, fmt.Errorf("--since must be a duration or RFC3339 timestamp") + } + if cutoff.After(now) { + return time.Time{}, errors.New("--since timestamp must not be in the future") + } + return cutoff.UTC(), nil +} + +func queryDeliveryTableSummary(ctx context.Context, conn *sql.DB, kind, table string, now, cutoff time.Time) (deliveryTableSummary, error) { + switch table { + case "jetmon_webhook_deliveries", "jetmon_alert_deliveries": + default: + return deliveryTableSummary{}, fmt.Errorf("unsupported delivery table %q", table) + } + + summary := deliveryTableSummary{Kind: kind} + + pendingQuery := fmt.Sprintf(` + SELECT COUNT(*), + COALESCE(TIMESTAMPDIFF(SECOND, MIN(created_at), ?), 0) + FROM %s + WHERE status = 'pending'`, table) + if err := conn.QueryRowContext(ctx, pendingQuery, now).Scan( + &summary.Pending, + &summary.OldestPendingAgeSec, + ); err != nil { + return deliveryTableSummary{}, fmt.Errorf("%s pending delivery summary: %w", kind, err) + } + + dueQuery := fmt.Sprintf(` + SELECT COUNT(*), + COALESCE(TIMESTAMPDIFF(SECOND, MIN(COALESCE(next_attempt_at, created_at)), ?), 0) + FROM %s + WHERE status = 'pending' + AND (next_attempt_at IS NULL OR next_attempt_at <= ?)`, table) + if err := conn.QueryRowContext(ctx, dueQuery, now, now).Scan( + &summary.DueNow, + &summary.OldestDueAgeSec, + ); err != nil { + return deliveryTableSummary{}, fmt.Errorf("%s due delivery summary: %w", kind, err) + } + + futureQuery := fmt.Sprintf(` + SELECT COUNT(*) + FROM %s + WHERE status = 'pending' + AND next_attempt_at > ?`, table) + if err := conn.QueryRowContext(ctx, futureQuery, now).Scan(&summary.FutureRetry); err != nil { + return deliveryTableSummary{}, fmt.Errorf("%s future delivery summary: %w", kind, err) + } + + deliveredQuery := fmt.Sprintf(` + SELECT COUNT(*) + FROM %s + WHERE status = 'delivered' + AND delivered_at >= ?`, table) + if err := conn.QueryRowContext(ctx, deliveredQuery, 
cutoff).Scan(&summary.DeliveredSince); err != nil { + return deliveryTableSummary{}, fmt.Errorf("%s delivered summary: %w", kind, err) + } + + abandonedSince, err := queryRecentTerminalDeliveryCount(ctx, conn, table, "abandoned", cutoff) + if err != nil { + return deliveryTableSummary{}, fmt.Errorf("%s abandoned summary: %w", kind, err) + } + summary.AbandonedSince = abandonedSince + + failedSince, err := queryRecentTerminalDeliveryCount(ctx, conn, table, "failed", cutoff) + if err != nil { + return deliveryTableSummary{}, fmt.Errorf("%s failed summary: %w", kind, err) + } + summary.FailedSince = failedSince + summary.OldestPendingAgeSec = maxInt64(0, summary.OldestPendingAgeSec) + summary.OldestDueAgeSec = maxInt64(0, summary.OldestDueAgeSec) + return summary, nil +} + +func queryRecentTerminalDeliveryCount(ctx context.Context, conn *sql.DB, table, status string, cutoff time.Time) (int64, error) { + switch table { + case "jetmon_webhook_deliveries", "jetmon_alert_deliveries": + default: + return 0, fmt.Errorf("unsupported delivery table %q", table) + } + switch status { + case "abandoned", "failed": + default: + return 0, fmt.Errorf("unsupported terminal status %q", status) + } + + withAttemptQuery := fmt.Sprintf(` + SELECT COUNT(*) + FROM %s + WHERE status = ? + AND last_attempt_at >= ?`, table) + var withAttempt int64 + if err := conn.QueryRowContext(ctx, withAttemptQuery, status, cutoff).Scan(&withAttempt); err != nil { + return 0, err + } + + createdFallbackQuery := fmt.Sprintf(` + SELECT COUNT(*) + FROM %s + WHERE status = ? 
+ AND last_attempt_at IS NULL + AND created_at >= ?`, table) + var createdFallback int64 + if err := conn.QueryRowContext(ctx, createdFallbackQuery, status, cutoff).Scan(&createdFallback); err != nil { + return 0, err + } + return withAttempt + createdFallback, nil +} + +func evaluateDeliveryCheckFailures(report deliveryCheckReport, opts deliveryCheckOptions) []string { + var failures []string + if opts.MaxPending >= 0 && report.Total.Pending > opts.MaxPending { + failures = append(failures, fmt.Sprintf("pending deliveries total=%d exceeds max-pending=%d", report.Total.Pending, opts.MaxPending)) + } + if opts.MaxDue >= 0 && report.Total.DueNow > opts.MaxDue { + failures = append(failures, fmt.Sprintf("due deliveries total=%d exceeds max-due=%d", report.Total.DueNow, opts.MaxDue)) + } + if opts.MaxAbandoned >= 0 && report.Total.AbandonedSince > opts.MaxAbandoned { + failures = append(failures, fmt.Sprintf("abandoned deliveries since %s total=%d exceeds max-abandoned=%d", report.Since.Format(time.RFC3339), report.Total.AbandonedSince, opts.MaxAbandoned)) + } + if opts.MaxFailed >= 0 && report.Total.FailedSince > opts.MaxFailed { + failures = append(failures, fmt.Sprintf("failed deliveries since %s total=%d exceeds max-failed=%d", report.Since.Format(time.RFC3339), report.Total.FailedSince, opts.MaxFailed)) + } + if opts.RequireRecentDelivery && report.Total.DeliveredSince == 0 { + failures = append(failures, fmt.Sprintf("no delivered rows since %s", report.Since.Format(time.RFC3339))) + } + if opts.RequireRecentWebhookDelivery && deliveredSince(report, "webhook") == 0 { + failures = append(failures, fmt.Sprintf("no webhook deliveries since %s", report.Since.Format(time.RFC3339))) + } + if opts.RequireRecentAlertDelivery && deliveredSince(report, "alert") == 0 { + failures = append(failures, fmt.Sprintf("no alert-contact deliveries since %s", report.Since.Format(time.RFC3339))) + } + return failures +} + +func renderDeliveryCheckReport(out io.Writer, report 
deliveryCheckReport, output string) error { + if output == "json" { + enc := json.NewEncoder(out) + enc.SetIndent("", " ") + return enc.Encode(report) + } + return renderDeliveryCheckText(out, report) +} + +func renderDeliveryCheckText(out io.Writer, report deliveryCheckReport) error { + fmt.Fprintf(out, "INFO deliverer_host=%q\n", report.Host) + fmt.Fprintf(out, "INFO delivery_check_generated_at=%s\n", report.GeneratedAt.Format(time.RFC3339)) + fmt.Fprintf(out, "INFO delivery_check_since=%s\n", report.Since.Format(time.RFC3339)) + if report.OwnerMessage != "" { + fmt.Fprintf(out, "%s %s\n", report.OwnerLevel, report.OwnerMessage) + } + + tw := tabwriter.NewWriter(out, 0, 0, 2, ' ', 0) + fmt.Fprintln(tw, "KIND\tPENDING\tDUE_NOW\tFUTURE_RETRY\tDELIVERED_SINCE\tABANDONED_SINCE\tFAILED_SINCE\tOLDEST_PENDING_SEC\tOLDEST_DUE_SEC") + for _, summary := range report.Tables { + writeDeliverySummaryRow(tw, summary) + } + writeDeliverySummaryRow(tw, report.Total) + if err := tw.Flush(); err != nil { + return err + } + + if report.OK { + fmt.Fprintln(out, "PASS delivery_check=ok") + return nil + } + for _, failure := range report.Failures { + fmt.Fprintf(out, "FAIL %s\n", failure) + } + return nil +} + +func writeDeliverySummaryRow(out io.Writer, summary deliveryTableSummary) { + fmt.Fprintf( + out, + "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n", + summary.Kind, + summary.Pending, + summary.DueNow, + summary.FutureRetry, + summary.DeliveredSince, + summary.AbandonedSince, + summary.FailedSince, + summary.OldestPendingAgeSec, + summary.OldestDueAgeSec, + ) +} + +func deliveredSince(report deliveryCheckReport, kind string) int64 { + for _, summary := range report.Tables { + if summary.Kind == kind { + return summary.DeliveredSince + } + } + return 0 +} + +func maxInt64(a, b int64) int64 { + if a > b { + return a + } + return b +} diff --git a/cmd/jetmon-deliverer/delivery_check_test.go b/cmd/jetmon-deliverer/delivery_check_test.go new file mode 100644 index 00000000..99963c51 --- 
/dev/null +++ b/cmd/jetmon-deliverer/delivery_check_test.go @@ -0,0 +1,366 @@ +package main + +import ( + "bytes" + "context" + "encoding/json" + "regexp" + "strings" + "testing" + "time" + + "github.com/Automattic/jetmon/internal/config" + "github.com/DATA-DOG/go-sqlmock" +) + +func TestParseDeliveryCheckOptions(t *testing.T) { + opts, err := parseDeliveryCheckOptions([]string{ + "--host=deliverer-1", + "--since=30m", + "--output=json", + "--max-pending=10", + "--max-due=0", + "--max-abandoned=1", + "--max-failed=2", + "--require-recent-delivery", + "--require-recent-webhook-delivery", + "--require-recent-alert-delivery", + }) + if err != nil { + t.Fatalf("parseDeliveryCheckOptions: %v", err) + } + if opts.HostOverride != "deliverer-1" { + t.Fatalf("HostOverride = %q, want deliverer-1", opts.HostOverride) + } + if opts.Since != "30m" || opts.Output != "json" { + t.Fatalf("parsed since/output = %q/%q", opts.Since, opts.Output) + } + if opts.MaxPending != 10 || opts.MaxDue != 0 || opts.MaxAbandoned != 1 || opts.MaxFailed != 2 { + t.Fatalf("parsed thresholds = pending:%d due:%d abandoned:%d failed:%d", opts.MaxPending, opts.MaxDue, opts.MaxAbandoned, opts.MaxFailed) + } + if !opts.RequireRecentDelivery || !opts.RequireRecentWebhookDelivery || !opts.RequireRecentAlertDelivery { + t.Fatalf("recent delivery flags = %+v, want all true", opts) + } + + defaults, err := parseDeliveryCheckOptions(nil) + if err != nil { + t.Fatalf("parseDeliveryCheckOptions(defaults): %v", err) + } + if defaults.Since != deliveryCheckDefaultSince || defaults.Output != "text" { + t.Fatalf("defaults = %+v", defaults) + } + if defaults.MaxPending != -1 || defaults.MaxDue != -1 || defaults.MaxAbandoned != -1 || defaults.MaxFailed != -1 { + t.Fatalf("default thresholds = %+v, want disabled", defaults) + } + + if _, err := parseDeliveryCheckOptions([]string{"--output=xml"}); err == nil { + t.Fatal("parseDeliveryCheckOptions accepted invalid output") + } + if _, err := 
parseDeliveryCheckOptions([]string{"--max-due=-2"}); err == nil { + t.Fatal("parseDeliveryCheckOptions accepted invalid threshold") + } + if _, err := parseDeliveryCheckOptions([]string{"--max-failed=-2"}); err == nil { + t.Fatal("parseDeliveryCheckOptions accepted invalid failed threshold") + } + if _, err := parseDeliveryCheckOptions([]string{"extra"}); err == nil { + t.Fatal("parseDeliveryCheckOptions accepted positional argument") + } +} + +func TestResolveDeliveryCheckCutoff(t *testing.T) { + now := time.Date(2026, 4, 29, 18, 30, 0, 0, time.UTC) + + durationCutoff, err := resolveDeliveryCheckCutoff(now, "45m") + if err != nil { + t.Fatalf("resolveDeliveryCheckCutoff(duration): %v", err) + } + if want := now.Add(-45 * time.Minute); !durationCutoff.Equal(want) { + t.Fatalf("duration cutoff = %s, want %s", durationCutoff, want) + } + + timestampCutoff, err := resolveDeliveryCheckCutoff(now, "2026-04-29T18:00:00Z") + if err != nil { + t.Fatalf("resolveDeliveryCheckCutoff(timestamp): %v", err) + } + if want := time.Date(2026, 4, 29, 18, 0, 0, 0, time.UTC); !timestampCutoff.Equal(want) { + t.Fatalf("timestamp cutoff = %s, want %s", timestampCutoff, want) + } + + for _, raw := range []string{"", "0s", "-1m", "not-time", "2026-04-29T19:00:00Z"} { + t.Run(raw, func(t *testing.T) { + if _, err := resolveDeliveryCheckCutoff(now, raw); err == nil { + t.Fatalf("resolveDeliveryCheckCutoff(%q) returned nil error", raw) + } + }) + } +} + +func TestBuildDeliveryCheckReportSummarizesAndAppliesThresholds(t *testing.T) { + sqlDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer sqlDB.Close() + + now := time.Date(2026, 4, 29, 18, 30, 0, 0, time.UTC) + cutoff := now.Add(-15 * time.Minute) + expectDeliverySummaryQueries(t, mock, "jetmon_webhook_deliveries", now, cutoff, deliveryTableSummary{ + Pending: 2, + DueNow: 1, + FutureRetry: 1, + DeliveredSince: 4, + AbandonedSince: 0, + FailedSince: 2, + OldestPendingAgeSec: 120, + OldestDueAgeSec: 
60, + }) + expectDeliverySummaryQueries(t, mock, "jetmon_alert_deliveries", now, cutoff, deliveryTableSummary{ + Pending: 4, + DueNow: 2, + FutureRetry: 2, + DeliveredSince: 0, + AbandonedSince: 1, + FailedSince: 0, + OldestPendingAgeSec: 90, + OldestDueAgeSec: 30, + }) + + opts := deliveryCheckOptions{ + Since: "15m", + MaxPending: 5, + MaxDue: 2, + MaxAbandoned: 0, + MaxFailed: 1, + RequireRecentDelivery: true, + } + report, err := buildDeliveryCheckReport(context.Background(), sqlDB, &config.Config{ + DeliveryOwnerHost: "deliverer-1", + }, "deliverer-1", opts, now) + if err != nil { + t.Fatalf("buildDeliveryCheckReport: %v", err) + } + if report.OK { + t.Fatal("report.OK = true, want false because thresholds fail") + } + if report.Total.Pending != 6 || report.Total.DueNow != 3 || report.Total.FutureRetry != 3 { + t.Fatalf("total queue summary = %+v", report.Total) + } + if report.Total.DeliveredSince != 4 || report.Total.AbandonedSince != 1 { + t.Fatalf("total terminal summary = %+v", report.Total) + } + if report.Total.FailedSince != 2 || report.Total.OldestPendingAgeSec != 120 || report.Total.OldestDueAgeSec != 60 { + t.Fatalf("total failed/age summary = %+v", report.Total) + } + if report.OwnerLevel != "INFO" || !strings.Contains(report.OwnerMessage, "matched") { + t.Fatalf("owner status = %q %q", report.OwnerLevel, report.OwnerMessage) + } + wantFailures := []string{ + "pending deliveries total=6 exceeds max-pending=5", + "due deliveries total=3 exceeds max-due=2", + "abandoned deliveries since 2026-04-29T18:15:00Z total=1 exceeds max-abandoned=0", + "failed deliveries since 2026-04-29T18:15:00Z total=2 exceeds max-failed=1", + } + if len(report.Failures) != len(wantFailures) { + t.Fatalf("failures = %v, want %d failures", report.Failures, len(wantFailures)) + } + for i, want := range wantFailures { + if report.Failures[i] != want { + t.Fatalf("failure[%d] = %q, want %q", i, report.Failures[i], want) + } + } + if err := mock.ExpectationsWereMet(); err != nil 
{ + t.Fatalf("sql expectations: %v", err) + } +} + +func TestBuildDeliveryCheckReportRequiresRecentDelivery(t *testing.T) { + sqlDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer sqlDB.Close() + + now := time.Date(2026, 4, 29, 18, 30, 0, 0, time.UTC) + cutoff := now.Add(-15 * time.Minute) + expectDeliverySummaryQueries(t, mock, "jetmon_webhook_deliveries", now, cutoff, deliveryTableSummary{}) + expectDeliverySummaryQueries(t, mock, "jetmon_alert_deliveries", now, cutoff, deliveryTableSummary{}) + + report, err := buildDeliveryCheckReport(context.Background(), sqlDB, &config.Config{}, "deliverer-1", deliveryCheckOptions{ + Since: "15m", + MaxPending: -1, + MaxDue: -1, + MaxAbandoned: -1, + MaxFailed: -1, + RequireRecentDelivery: true, + }, now) + if err != nil { + t.Fatalf("buildDeliveryCheckReport: %v", err) + } + if report.OK { + t.Fatal("report.OK = true, want false") + } + if len(report.Failures) != 1 || !strings.Contains(report.Failures[0], "no delivered rows since") { + t.Fatalf("failures = %v", report.Failures) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("sql expectations: %v", err) + } +} + +func TestBuildDeliveryCheckReportRequiresRecentDeliveryByKind(t *testing.T) { + sqlDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer sqlDB.Close() + + now := time.Date(2026, 4, 29, 18, 30, 0, 0, time.UTC) + cutoff := now.Add(-15 * time.Minute) + expectDeliverySummaryQueries(t, mock, "jetmon_webhook_deliveries", now, cutoff, deliveryTableSummary{DeliveredSince: 1}) + expectDeliverySummaryQueries(t, mock, "jetmon_alert_deliveries", now, cutoff, deliveryTableSummary{}) + + report, err := buildDeliveryCheckReport(context.Background(), sqlDB, &config.Config{}, "deliverer-1", deliveryCheckOptions{ + Since: "15m", + MaxPending: -1, + MaxDue: -1, + MaxAbandoned: -1, + MaxFailed: -1, + RequireRecentWebhookDelivery: true, + RequireRecentAlertDelivery: true, + }, now) 
+ if err != nil { + t.Fatalf("buildDeliveryCheckReport: %v", err) + } + if report.OK { + t.Fatal("report.OK = true, want false") + } + if len(report.Failures) != 1 || !strings.Contains(report.Failures[0], "no alert-contact deliveries since") { + t.Fatalf("failures = %v", report.Failures) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("sql expectations: %v", err) + } +} + +func TestQueryRecentTerminalDeliveryCountUsesAttemptAndCreatedFallback(t *testing.T) { + sqlDB, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer sqlDB.Close() + + cutoff := time.Date(2026, 4, 29, 18, 15, 0, 0, time.UTC) + mock.ExpectQuery(`(?s)FROM jetmon_webhook_deliveries.*status = \?.*last_attempt_at >= \?`). + WithArgs("abandoned", cutoff). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(2)) + mock.ExpectQuery(`(?s)FROM jetmon_webhook_deliveries.*status = \?.*last_attempt_at IS NULL.*created_at >= \?`). + WithArgs("abandoned", cutoff). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + + got, err := queryRecentTerminalDeliveryCount(context.Background(), sqlDB, "jetmon_webhook_deliveries", "abandoned", cutoff) + if err != nil { + t.Fatalf("queryRecentTerminalDeliveryCount: %v", err) + } + if got != 3 { + t.Fatalf("queryRecentTerminalDeliveryCount() = %d, want 3", got) + } + if _, err := queryRecentTerminalDeliveryCount(context.Background(), sqlDB, "bad_table", "abandoned", cutoff); err == nil { + t.Fatal("queryRecentTerminalDeliveryCount accepted bad table") + } + if _, err := queryRecentTerminalDeliveryCount(context.Background(), sqlDB, "jetmon_webhook_deliveries", "delivered", cutoff); err == nil { + t.Fatal("queryRecentTerminalDeliveryCount accepted bad status") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("sql expectations: %v", err) + } +} + +func TestRenderDeliveryCheckReport(t *testing.T) { + report := deliveryCheckReport{ + OK: true, + Host: "deliverer-1", + GeneratedAt: 
time.Date(2026, 4, 29, 18, 30, 0, 0, time.UTC), + Since: time.Date(2026, 4, 29, 18, 15, 0, 0, time.UTC), + Tables: []deliveryTableSummary{ + {Kind: "webhook", Pending: 1, DueNow: 0, FutureRetry: 1, DeliveredSince: 2, FailedSince: 1, OldestPendingAgeSec: 120}, + {Kind: "alert", DeliveredSince: 3}, + }, + Total: deliveryTableSummary{Kind: "total", Pending: 1, FutureRetry: 1, DeliveredSince: 5, FailedSince: 1, OldestPendingAgeSec: 120}, + } + + var textOut bytes.Buffer + if err := renderDeliveryCheckReport(&textOut, report, "text"); err != nil { + t.Fatalf("renderDeliveryCheckReport(text): %v", err) + } + text := textOut.String() + for _, want := range []string{"INFO deliverer_host=\"deliverer-1\"", "FAILED_SINCE", "OLDEST_PENDING_SEC", "webhook", "total", "PASS delivery_check=ok"} { + if !strings.Contains(text, want) { + t.Fatalf("text output missing %q:\n%s", want, text) + } + } + + var jsonOut bytes.Buffer + if err := renderDeliveryCheckReport(&jsonOut, report, "json"); err != nil { + t.Fatalf("renderDeliveryCheckReport(json): %v", err) + } + var decoded deliveryCheckReport + if err := json.Unmarshal(jsonOut.Bytes(), &decoded); err != nil { + t.Fatalf("json output did not decode: %v\n%s", err, jsonOut.String()) + } + if !decoded.OK || decoded.Host != "deliverer-1" || decoded.Total.DeliveredSince != 5 { + t.Fatalf("decoded json = %+v", decoded) + } + if decoded.Total.FailedSince != 1 || decoded.Total.OldestPendingAgeSec != 120 { + t.Fatalf("decoded json summary = %+v", decoded.Total) + } +} + +func TestRenderDeliveryCheckReportFailureText(t *testing.T) { + report := deliveryCheckReport{ + OK: false, + Host: "deliverer-1", + GeneratedAt: time.Date(2026, 4, 29, 18, 30, 0, 0, time.UTC), + Since: time.Date(2026, 4, 29, 18, 15, 0, 0, time.UTC), + Total: deliveryTableSummary{Kind: "total"}, + Failures: []string{"due deliveries total=1 exceeds max-due=0"}, + } + + var out bytes.Buffer + if err := renderDeliveryCheckReport(&out, report, "text"); err != nil { + 
t.Fatalf("renderDeliveryCheckReport(text): %v", err) + } + if !strings.Contains(out.String(), "FAIL due deliveries total=1 exceeds max-due=0") { + t.Fatalf("failure text missing:\n%s", out.String()) + } +} + +func expectDeliverySummaryQueries(t *testing.T, mock sqlmock.Sqlmock, table string, now, cutoff time.Time, summary deliveryTableSummary) { + t.Helper() + quotedTable := regexp.QuoteMeta(table) + mock.ExpectQuery(`(?s)MIN\(created_at\).*FROM ` + quotedTable + `.*WHERE status = 'pending'`). + WithArgs(now). + WillReturnRows(sqlmock.NewRows([]string{"count", "oldest_pending_age_sec"}). + AddRow(summary.Pending, summary.OldestPendingAgeSec)) + mock.ExpectQuery(`(?s)MIN\(COALESCE\(next_attempt_at, created_at\)\).*FROM `+quotedTable+`.*next_attempt_at IS NULL`). + WithArgs(now, now). + WillReturnRows(sqlmock.NewRows([]string{"count", "oldest_due_age_sec"}). + AddRow(summary.DueNow, summary.OldestDueAgeSec)) + mock.ExpectQuery(`(?s)FROM ` + quotedTable + `.*status = 'pending'.*next_attempt_at > \?`). + WithArgs(now). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(summary.FutureRetry)) + mock.ExpectQuery(`(?s)FROM ` + quotedTable + `.*status = 'delivered'.*delivered_at >= \?`). + WithArgs(cutoff). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(summary.DeliveredSince)) + mock.ExpectQuery(`(?s)FROM `+quotedTable+`.*status = \?.*last_attempt_at >= \?`). + WithArgs("abandoned", cutoff). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(summary.AbandonedSince)) + mock.ExpectQuery(`(?s)FROM `+quotedTable+`.*status = \?.*last_attempt_at IS NULL.*created_at >= \?`). + WithArgs("abandoned", cutoff). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + mock.ExpectQuery(`(?s)FROM `+quotedTable+`.*status = \?.*last_attempt_at >= \?`). + WithArgs("failed", cutoff). 
+ WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(summary.FailedSince)) + mock.ExpectQuery(`(?s)FROM `+quotedTable+`.*status = \?.*last_attempt_at IS NULL.*created_at >= \?`). + WithArgs("failed", cutoff). + WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) +} diff --git a/cmd/jetmon-deliverer/main.go b/cmd/jetmon-deliverer/main.go new file mode 100644 index 00000000..f2e9b5a0 --- /dev/null +++ b/cmd/jetmon-deliverer/main.go @@ -0,0 +1,349 @@ +package main + +import ( + "context" + "database/sql" + "flag" + "fmt" + "io" + "log" + "os" + "os/signal" + "strings" + "syscall" + "time" + + "github.com/Automattic/jetmon/internal/audit" + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/db" + "github.com/Automattic/jetmon/internal/deliverer" + "github.com/Automattic/jetmon/internal/fleethealth" + "github.com/Automattic/jetmon/internal/metrics" + "github.com/Automattic/jetmon/internal/processmetrics" +) + +const processHealthWriteTimeout = 2 * time.Second + +// Injected at build time via -ldflags. 
+var ( + version = "dev" + buildDate = "unknown" + goVersion = "unknown" +) + +func main() { + if len(os.Args) > 1 { + switch os.Args[1] { + case "version": + fmt.Printf("jetmon-deliverer %s (built %s with %s)\n", version, buildDate, goVersion) + return + case "validate-config": + cmdValidateConfig(os.Args[2:]) + return + case "delivery-check": + cmdDeliveryCheck(os.Args[2:]) + return + default: + fmt.Fprintf(os.Stderr, "unknown command %q (want: version, validate-config, delivery-check)\n", os.Args[1]) + os.Exit(2) + } + } + run() +} + +type delivererValidationOptions struct { + HostOverride string + RequireOwnerMatch bool + RequireEmailDelivery bool + RequireAPIDisabled bool +} + +func parseValidateConfigOptions(args []string) (delivererValidationOptions, error) { + var opts delivererValidationOptions + fs := flag.NewFlagSet("validate-config", flag.ContinueOnError) + fs.SetOutput(io.Discard) + fs.StringVar(&opts.HostOverride, "host", "", "host id to validate against DELIVERY_OWNER_HOST (default current hostname)") + fs.BoolVar(&opts.RequireOwnerMatch, "require-owner-match", false, "fail unless DELIVERY_OWNER_HOST exactly matches the validated host") + fs.BoolVar(&opts.RequireEmailDelivery, "require-email-delivery", false, "fail unless EMAIL_TRANSPORT is smtp or wpcom") + fs.BoolVar(&opts.RequireAPIDisabled, "require-api-disabled", false, "fail unless API_PORT is 0 in the deliverer config") + if err := fs.Parse(args); err != nil { + return opts, err + } + if fs.NArg() != 0 { + return opts, fmt.Errorf("unexpected argument %q", fs.Arg(0)) + } + return opts, nil +} + +func cmdValidateConfig(args []string) { + opts, err := parseValidateConfigOptions(args) + if err != nil { + fmt.Fprintf(os.Stderr, "usage: jetmon-deliverer validate-config [--host=] [--require-owner-match] [--require-email-delivery] [--require-api-disabled]\n") + fmt.Fprintf(os.Stderr, "FAIL %v\n", err) + os.Exit(2) + } + + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := 
config.Load(configPath); err != nil { + fmt.Fprintf(os.Stderr, "FAIL config parse: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS config parse") + + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + fmt.Fprintf(os.Stderr, "FAIL db connect: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS db connect") + + cfg := config.Get() + hostID := strings.TrimSpace(opts.HostOverride) + if hostID == "" { + hostID = db.Hostname() + } + fmt.Printf("INFO deliverer_host=%q\n", hostID) + fmt.Printf("INFO email_transport=%s\n", emailTransportLabel(cfg)) + if !emailTransportDelivers(cfg) { + fmt.Printf("WARN email_transport=%s; alert-contact emails will be logged but not delivered\n", emailTransportLabel(cfg)) + } + if cfg.APIPort > 0 { + fmt.Printf("WARN api_port=%d; standalone deliverer ignores API_PORT, confirm this is a process-specific config\n", cfg.APIPort) + } else { + fmt.Println("PASS api_port=disabled") + } + if level, msg := deliveryOwnerStatus(cfg, hostID); msg != "" { + fmt.Printf("%s %s\n", level, msg) + } + failures := validateDelivererConfigRequirements(cfg, hostID, opts) + if len(failures) > 0 { + for _, failure := range failures { + fmt.Fprintf(os.Stderr, "FAIL %s\n", failure) + } + os.Exit(1) + } + + fmt.Println("\nvalidation passed") +} + +func run() { + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := config.Load(configPath); err != nil { + log.Fatalf("load config: %v", err) + } + cfg := config.Get() + log.Printf("config: email_transport=%s", emailTransportLabel(cfg)) + if !emailTransportDelivers(cfg) { + log.Printf("WARN: email_transport=%s; alert-contact emails will be logged but not delivered", emailTransportLabel(cfg)) + } + + config.LoadDB() + if err := db.ConnectWithRetry(10); err != nil { + log.Fatalf("db connect: %v", err) + } + audit.Init(db.DB()) + + if err := metrics.Init("statsd:8125", db.Hostname()); err != nil { + log.Printf("warning: statsd init failed: %v", err) + } + + hostname := db.Hostname() + 
processStartedAt := time.Now().UTC() + processID := fleethealth.ProcessID(hostname, fleethealth.ProcessDeliverer) + workersEnabled := deliveryWorkersShouldStart(cfg, hostname) + publishProcessHealth := func(state string) { + snapshot := delivererProcessHealthSnapshot(hostname, processStartedAt, state, cfg, workersEnabled, delivererDependencyHealth(context.Background(), db.DB(), metrics.Global() != nil, time.Now().UTC())) + ctx, cancel := context.WithTimeout(context.Background(), processHealthWriteTimeout) + if err := fleethealth.Upsert(ctx, db.DB(), snapshot); err != nil { + log.Printf("process health: %v", err) + } + cancel() + } + if level, msg := deliveryOwnerStatus(cfg, hostname); msg != "" { + if level == "WARN" { + log.Printf("WARN: %s", msg) + } else { + log.Printf("config: %s", msg) + } + } + initialState := fleethealth.StateRunning + if !workersEnabled { + initialState = fleethealth.StateIdle + } + publishProcessHealth(initialState) + stopHealth := make(chan struct{}) + go func() { + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + for { + select { + case <-ticker.C: + publishProcessHealth(initialState) + case <-stopHealth: + return + } + } + }() + + if !workersEnabled { + waitForShutdown() + close(stopHealth) + publishProcessHealth(fleethealth.StateStopping) + ctx, cancel := context.WithTimeout(context.Background(), processHealthWriteTimeout) + if err := fleethealth.MarkStopped(ctx, db.DB(), processID, time.Now().UTC()); err != nil { + log.Printf("process health: %v", err) + } + cancel() + log.Println("jetmon-deliverer: shutdown complete") + return + } + + runtime := deliverer.Start(deliverer.Config{ + DB: db.DB(), + InstanceID: hostname, + Dispatchers: deliverer.BuildAlertDispatchers(cfg), + }) + waitForShutdown() + close(stopHealth) + publishProcessHealth(fleethealth.StateStopping) + runtime.Stop() + ctx, cancel := context.WithTimeout(context.Background(), processHealthWriteTimeout) + if err := fleethealth.MarkStopped(ctx, db.DB(), 
processID, time.Now().UTC()); err != nil { + log.Printf("process health: %v", err) + } + cancel() + log.Println("jetmon-deliverer: shutdown complete") +} + +func deliveryWorkersShouldStart(cfg *config.Config, hostname string) bool { + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + return owner == "" || owner == hostname +} + +func deliveryOwnerStatus(cfg *config.Config, hostname string) (string, string) { + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + if owner == "" { + return "WARN", fmt.Sprintf("delivery_owner_host is unset; standalone deliverer on host %q will run delivery workers", hostname) + } + if owner == hostname { + return "INFO", fmt.Sprintf("delivery_owner_host=%q matched; delivery workers enabled on this host", owner) + } + return "INFO", fmt.Sprintf("delivery_owner_host=%q; standalone deliverer idle on host %q", owner, hostname) +} + +func validateDelivererConfigRequirements(cfg *config.Config, hostname string, opts delivererValidationOptions) []string { + if cfg == nil { + return []string{"config is not loaded"} + } + hostID := strings.TrimSpace(hostname) + failures := []string{} + if opts.RequireOwnerMatch { + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + if hostID == "" { + failures = append(failures, "validated host id is empty") + } else if owner == "" { + failures = append(failures, fmt.Sprintf("DELIVERY_OWNER_HOST must be set to %q for single-owner deliverer rollout", hostID)) + } else if owner != hostID { + failures = append(failures, fmt.Sprintf("DELIVERY_OWNER_HOST=%q does not match deliverer host %q", owner, hostID)) + } + } + if opts.RequireEmailDelivery && !emailTransportDelivers(cfg) { + failures = append(failures, fmt.Sprintf("EMAIL_TRANSPORT=%q does not deliver email; set smtp or wpcom", emailTransportLabel(cfg))) + } + if opts.RequireAPIDisabled && cfg.APIPort > 0 { + failures = append(failures, fmt.Sprintf("API_PORT=%d must be 0 for standalone deliverer config", cfg.APIPort)) + } + return failures +} + +func 
delivererProcessHealthSnapshot(hostname string, startedAt time.Time, state string, cfg *config.Config, workersEnabled bool, health []fleethealth.DependencyHealth) fleethealth.Snapshot { + mem := processmetrics.CurrentMemory() + healthStatus := fleethealth.RollupHealthStatus(health) + if workersEnabled && strings.TrimSpace(cfg.DeliveryOwnerHost) == "" && healthStatus == fleethealth.HealthGreen { + healthStatus = fleethealth.HealthAmber + } + if state == fleethealth.StateStopping || state == fleethealth.StateStopped { + healthStatus = fleethealth.HealthAmber + } + return fleethealth.Snapshot{ + HostID: hostname, + ProcessType: fleethealth.ProcessDeliverer, + PID: os.Getpid(), + Version: version, + BuildDate: buildDate, + GoVersion: goVersion, + State: state, + HealthStatus: healthStatus, + StartedAt: startedAt, + UpdatedAt: time.Now().UTC(), + DeliveryWorkersEnabled: workersEnabled, + DeliveryOwnerHost: cfg.DeliveryOwnerHost, + GoSysMemMB: mem.GoSysMemMB, + RSSMemMB: mem.RSSMemMB, + DependencyHealth: health, + } +} + +func delivererDependencyHealth(ctx context.Context, sqlDB *sql.DB, statsdReady bool, checkedAt time.Time) []fleethealth.DependencyHealth { + return []fleethealth.DependencyHealth{ + delivererMySQLHealth(ctx, sqlDB, checkedAt), + delivererStatsDHealth(statsdReady, checkedAt), + } +} + +func delivererMySQLHealth(ctx context.Context, sqlDB *sql.DB, checkedAt time.Time) fleethealth.DependencyHealth { + entry := fleethealth.DependencyHealth{Name: "mysql", CheckedAt: checkedAt} + if sqlDB == nil { + entry.Status = "red" + entry.LastError = "database pool is not initialized" + return entry + } + pingCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + start := time.Now() + if err := sqlDB.PingContext(pingCtx); err != nil { + entry.Status = "red" + entry.LatencyMS = time.Since(start).Milliseconds() + entry.LastError = err.Error() + return entry + } + entry.Status = "green" + entry.LatencyMS = time.Since(start).Milliseconds() + return entry 
+} + +func delivererStatsDHealth(ready bool, checkedAt time.Time) fleethealth.DependencyHealth { + entry := fleethealth.DependencyHealth{Name: "statsd", CheckedAt: checkedAt} + if !ready { + entry.Status = "amber" + entry.LastError = "statsd client is not initialized" + return entry + } + entry.Status = "green" + return entry +} + +func waitForShutdown() { + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + sig := <-sigCh + log.Printf("received %s, stopping", sig) +} + +func emailTransportLabel(cfg *config.Config) string { + if cfg.EmailTransport == "" { + return "stub" + } + return cfg.EmailTransport +} + +func emailTransportDelivers(cfg *config.Config) bool { + return cfg.EmailTransport == "smtp" || cfg.EmailTransport == "wpcom" +} + +func envOrDefault(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} diff --git a/cmd/jetmon-deliverer/main_test.go b/cmd/jetmon-deliverer/main_test.go new file mode 100644 index 00000000..a8816623 --- /dev/null +++ b/cmd/jetmon-deliverer/main_test.go @@ -0,0 +1,269 @@ +package main + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/fleethealth" + "github.com/DATA-DOG/go-sqlmock" +) + +func TestDeliveryWorkersShouldStart(t *testing.T) { + tests := []struct { + name string + cfg config.Config + hostname string + wantStart bool + wantLevel string + wantMsg string + }{ + { + name: "empty owner starts with warning", + cfg: config.Config{}, + hostname: "host-a", + wantStart: true, + wantLevel: "WARN", + wantMsg: "delivery_owner_host is unset", + }, + { + name: "matching owner starts", + cfg: config.Config{ + DeliveryOwnerHost: "host-a", + }, + hostname: "host-a", + wantStart: true, + wantLevel: "INFO", + wantMsg: "matched", + }, + { + name: "non-owner idles", + cfg: config.Config{ + DeliveryOwnerHost: "host-a", + }, + hostname: "host-b", + wantLevel: "INFO", + 
wantMsg: "idle on host", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := deliveryWorkersShouldStart(&tt.cfg, tt.hostname); got != tt.wantStart { + t.Fatalf("deliveryWorkersShouldStart() = %v, want %v", got, tt.wantStart) + } + level, msg := deliveryOwnerStatus(&tt.cfg, tt.hostname) + if level != tt.wantLevel { + t.Fatalf("deliveryOwnerStatus() level = %q, want %q", level, tt.wantLevel) + } + if !strings.Contains(msg, tt.wantMsg) { + t.Fatalf("deliveryOwnerStatus() message = %q, want substring %q", msg, tt.wantMsg) + } + }) + } +} + +func TestParseValidateConfigOptions(t *testing.T) { + opts, err := parseValidateConfigOptions([]string{ + "--host=deliverer-1", + "--require-owner-match", + "--require-email-delivery", + "--require-api-disabled", + }) + if err != nil { + t.Fatalf("parseValidateConfigOptions: %v", err) + } + if opts.HostOverride != "deliverer-1" { + t.Fatalf("HostOverride = %q, want deliverer-1", opts.HostOverride) + } + if !opts.RequireOwnerMatch || !opts.RequireEmailDelivery || !opts.RequireAPIDisabled { + t.Fatalf("parsed options = %+v, want all requirements enabled", opts) + } + + if _, err := parseValidateConfigOptions([]string{"extra"}); err == nil { + t.Fatal("parseValidateConfigOptions accepted unexpected positional argument") + } +} + +func TestValidateDelivererConfigRequirements(t *testing.T) { + tests := []struct { + name string + cfg config.Config + hostname string + opts delivererValidationOptions + want []string + }{ + { + name: "single owner production config passes", + cfg: config.Config{ + DeliveryOwnerHost: "deliverer-1", + EmailTransport: "smtp", + }, + hostname: "deliverer-1", + opts: delivererValidationOptions{ + RequireOwnerMatch: true, + RequireEmailDelivery: true, + RequireAPIDisabled: true, + }, + }, + { + name: "owner required but empty", + cfg: config.Config{EmailTransport: "smtp"}, + hostname: "deliverer-1", + opts: delivererValidationOptions{RequireOwnerMatch: true}, + want: 
[]string{"DELIVERY_OWNER_HOST must be set"}, + }, + { + name: "owner mismatch", + cfg: config.Config{ + DeliveryOwnerHost: "deliverer-2", + EmailTransport: "smtp", + }, + hostname: "deliverer-1", + opts: delivererValidationOptions{RequireOwnerMatch: true}, + want: []string{"does not match"}, + }, + { + name: "stub email rejected", + cfg: config.Config{ + DeliveryOwnerHost: "deliverer-1", + EmailTransport: "stub", + }, + hostname: "deliverer-1", + opts: delivererValidationOptions{RequireEmailDelivery: true}, + want: []string{"does not deliver email"}, + }, + { + name: "api port rejected", + cfg: config.Config{ + DeliveryOwnerHost: "deliverer-1", + EmailTransport: "smtp", + APIPort: 8090, + }, + hostname: "deliverer-1", + opts: delivererValidationOptions{RequireAPIDisabled: true}, + want: []string{"API_PORT=8090"}, + }, + { + name: "empty host rejected when owner must match", + cfg: config.Config{DeliveryOwnerHost: "deliverer-1"}, + hostname: " ", + opts: delivererValidationOptions{RequireOwnerMatch: true}, + want: []string{"host id is empty"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + failures := validateDelivererConfigRequirements(&tt.cfg, tt.hostname, tt.opts) + if len(tt.want) == 0 { + if len(failures) != 0 { + t.Fatalf("failures = %v, want none", failures) + } + return + } + if len(failures) != len(tt.want) { + t.Fatalf("failures = %v, want %d failures", failures, len(tt.want)) + } + for i, want := range tt.want { + if !strings.Contains(failures[i], want) { + t.Fatalf("failure[%d] = %q, want substring %q", i, failures[i], want) + } + } + }) + } +} + +func TestEmailTransportLabelAndDelivery(t *testing.T) { + tests := []struct { + name string + cfg config.Config + label string + delivers bool + }{ + {name: "empty is stub alias", cfg: config.Config{}, label: "stub"}, + {name: "stub logs only", cfg: config.Config{EmailTransport: "stub"}, label: "stub"}, + {name: "smtp delivers", cfg: config.Config{EmailTransport: "smtp"}, label: 
"smtp", delivers: true}, + {name: "wpcom delivers", cfg: config.Config{EmailTransport: "wpcom"}, label: "wpcom", delivers: true}, + {name: "unknown does not deliver", cfg: config.Config{EmailTransport: "sendmail"}, label: "sendmail"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := emailTransportLabel(&tt.cfg); got != tt.label { + t.Fatalf("emailTransportLabel() = %q, want %q", got, tt.label) + } + if got := emailTransportDelivers(&tt.cfg); got != tt.delivers { + t.Fatalf("emailTransportDelivers() = %v, want %v", got, tt.delivers) + } + }) + } +} + +func TestDelivererProcessHealthSnapshot(t *testing.T) { + started := time.Date(2026, 4, 30, 11, 0, 0, 0, time.UTC) + cfg := &config.Config{DeliveryOwnerHost: "deliverer-1"} + snapshot := delivererProcessHealthSnapshot("deliverer-1", started, fleethealth.StateRunning, cfg, true, []fleethealth.DependencyHealth{{ + Name: "mysql", + Status: "green", + CheckedAt: started, + }}) + + if snapshot.HostID != "deliverer-1" { + t.Fatalf("HostID = %q, want deliverer-1", snapshot.HostID) + } + if snapshot.ProcessType != fleethealth.ProcessDeliverer { + t.Fatalf("ProcessType = %q, want deliverer", snapshot.ProcessType) + } + if !snapshot.DeliveryWorkersEnabled { + t.Fatal("DeliveryWorkersEnabled = false, want true") + } + if snapshot.DeliveryOwnerHost != "deliverer-1" { + t.Fatalf("DeliveryOwnerHost = %q, want deliverer-1", snapshot.DeliveryOwnerHost) + } + if snapshot.HealthStatus != fleethealth.HealthGreen { + t.Fatalf("HealthStatus = %q, want green", snapshot.HealthStatus) + } + if len(snapshot.DependencyHealth) != 1 { + t.Fatalf("DependencyHealth len = %d, want 1", len(snapshot.DependencyHealth)) + } +} + +func TestDelivererDependencyHealth(t *testing.T) { + sqlDB, mock, err := sqlmock.New(sqlmock.MonitorPingsOption(true)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer sqlDB.Close() + mock.ExpectPing() + + checkedAt := time.Date(2026, 4, 30, 11, 1, 0, 0, time.UTC) + entries := 
delivererDependencyHealth(context.Background(), sqlDB, false, checkedAt) + if len(entries) != 2 { + t.Fatalf("entries len = %d, want 2", len(entries)) + } + if entries[0].Name != "mysql" || entries[0].Status != "green" { + t.Fatalf("mysql entry = %+v, want green", entries[0]) + } + if entries[1].Name != "statsd" || entries[1].Status != "amber" { + t.Fatalf("statsd entry = %+v, want amber", entries[1]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Fatalf("sql expectations: %v", err) + } +} + +func TestEnvOrDefault(t *testing.T) { + const key = "JETMON_DELIVERER_TEST_ENV_OR_DEFAULT" + t.Setenv(key, "") + if got := envOrDefault(key, "fallback"); got != "fallback" { + t.Fatalf("envOrDefault() = %q, want fallback", got) + } + + t.Setenv(key, "set-value") + if got := envOrDefault(key, "fallback"); got != "set-value" { + t.Fatalf("envOrDefault() = %q, want set-value", got) + } +} diff --git a/cmd/jetmon-testsite/main.go b/cmd/jetmon-testsite/main.go new file mode 100644 index 00000000..39a8a3b8 --- /dev/null +++ b/cmd/jetmon-testsite/main.go @@ -0,0 +1,309 @@ +package main + +import ( + "context" + "crypto/hmac" + "crypto/rand" + "crypto/rsa" + "crypto/sha256" + "crypto/tls" + "crypto/x509" + "crypto/x509/pkix" + "encoding/hex" + "encoding/json" + "encoding/pem" + "errors" + "fmt" + "io" + "log" + "math/big" + "net" + "net/http" + "os" + "os/signal" + "strconv" + "strings" + "sync" + "syscall" + "time" +) + +const ( + defaultHTTPAddr = ":8091" + defaultHTTPSAddr = ":8443" +) + +func main() { + if len(os.Args) > 1 && os.Args[1] == "healthcheck" { + if err := healthcheck(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } + return + } + + httpAddr := envOrDefault("FIXTURE_HTTP_ADDR", defaultHTTPAddr) + httpsAddr := envOrDefault("FIXTURE_HTTPS_ADDR", defaultHTTPSAddr) + handler := newFixtureHandler() + + servers := []*http.Server{{ + Addr: httpAddr, + Handler: handler, + }} + if httpsAddr != "" { + cert, err := selfSignedCert() + if err != nil { + 
log.Fatalf("generate tls cert: %v", err) + } + servers = append(servers, &http.Server{ + Addr: httpsAddr, + Handler: handler, + TLSConfig: &tls.Config{Certificates: []tls.Certificate{cert}, MinVersion: tls.VersionTLS12}, + }) + } + + errCh := make(chan error, len(servers)) + for _, srv := range servers { + srv := srv + go func() { + log.Printf("jetmon-testsite: listening on %s", srv.Addr) + var err error + if srv.TLSConfig != nil { + err = srv.ListenAndServeTLS("", "") + } else { + err = srv.ListenAndServe() + } + if err != nil && !errors.Is(err, http.ErrServerClosed) { + errCh <- err + } + }() + } + + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM) + select { + case sig := <-sigCh: + log.Printf("jetmon-testsite: shutdown signal=%s", sig) + case err := <-errCh: + log.Printf("jetmon-testsite: server error: %v", err) + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + for _, srv := range servers { + if err := srv.Shutdown(ctx); err != nil { + log.Printf("jetmon-testsite: shutdown %s: %v", srv.Addr, err) + } + } +} + +func newFixtureHandler() http.Handler { + mux := http.NewServeMux() + webhooks := &fixtureWebhookReceiver{} + mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + _, _ = io.WriteString(w, "ok\n") + }) + mux.HandleFunc("/webhook", webhooks.handleWebhook) + mux.HandleFunc("/webhook/requests", webhooks.handleRequests) + mux.HandleFunc("/ok", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + _, _ = io.WriteString(w, "jetmon fixture ok\n") + }) + mux.HandleFunc("/tls", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + _, _ = io.WriteString(w, "jetmon fixture tls endpoint\n") + }) + mux.HandleFunc("/keyword", func(w http.ResponseWriter, r *http.Request) { + 
w.Header().Set("Content-Type", "text/plain; charset=utf-8") + _, _ = io.WriteString(w, "jetmon fixture keyword present\n") + }) + mux.HandleFunc("/redirect", func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, "/ok", http.StatusFound) + }) + mux.HandleFunc("/slow", func(w http.ResponseWriter, r *http.Request) { + delay := fixtureDelay(r.URL.Query().Get("delay"), 5*time.Second) + time.Sleep(delay) + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + fmt.Fprintf(w, "slow response after %s\n", delay) + }) + mux.HandleFunc("/status/", func(w http.ResponseWriter, r *http.Request) { + raw := strings.TrimPrefix(r.URL.Path, "/status/") + code, err := strconv.Atoi(raw) + if err != nil || code < 100 || code > 599 { + http.Error(w, "status must be 100-599", http.StatusBadRequest) + return + } + w.WriteHeader(code) + if code != http.StatusNoContent && code != http.StatusNotModified { + fmt.Fprintf(w, "status %d\n", code) + } + }) + return mux +} + +type fixtureWebhookReceiver struct { + mu sync.Mutex + nextID int + requests []fixtureWebhookRequest +} + +type fixtureWebhookRequest struct { + ID int `json:"id"` + ReceivedAt string `json:"received_at"` + Event string `json:"event,omitempty"` + Delivery string `json:"delivery,omitempty"` + Signature string `json:"signature,omitempty"` + SignatureValid *bool `json:"signature_valid,omitempty"` + Body string `json:"body"` +} + +func (f *fixtureWebhookReceiver) handleWebhook(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + body, err := io.ReadAll(http.MaxBytesReader(w, r.Body, 1<<20)) + if err != nil { + http.Error(w, "read body: "+err.Error(), http.StatusBadRequest) + return + } + signature := r.Header.Get("X-Jetmon-Signature") + var signatureValid *bool + if secret := r.URL.Query().Get("secret"); secret != "" { + valid := verifyJetmonSignature(signature, body, secret) + signatureValid = &valid + } 
	// Record the delivery under the lock; IDs are 1-based and monotonic until
	// a DELETE on /webhook/requests resets the recorder.

	f.mu.Lock()
	f.nextID++
	f.requests = append(f.requests, fixtureWebhookRequest{
		ID:             f.nextID,
		ReceivedAt:     time.Now().UTC().Format(time.RFC3339Nano),
		Event:          r.Header.Get("X-Jetmon-Event"),
		Delivery:       r.Header.Get("X-Jetmon-Delivery"),
		Signature:      signature,
		SignatureValid: signatureValid,
		Body:           string(body),
	})
	f.mu.Unlock()

	w.WriteHeader(http.StatusNoContent)
}

// handleRequests serves the recorder's inspection API on /webhook/requests:
// GET returns {"count": N, "requests": [...]} and DELETE clears all recorded
// deliveries and resets the ID counter. Any other method is rejected with 405.
func (f *fixtureWebhookReceiver) handleRequests(w http.ResponseWriter, r *http.Request) {
	switch r.Method {
	case http.MethodGet:
		f.mu.Lock()
		// Snapshot under the lock so JSON encoding never races a concurrent POST.
		requests := append([]fixtureWebhookRequest(nil), f.requests...)
		f.mu.Unlock()
		writeFixtureJSON(w, map[string]any{
			"count":    len(requests),
			"requests": requests,
		})
	case http.MethodDelete:
		f.mu.Lock()
		f.nextID = 0
		f.requests = nil
		f.mu.Unlock()
		w.WriteHeader(http.StatusNoContent)
	default:
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
	}
}

// verifyJetmonSignature checks a signature header of the form
// "t=<timestamp>,v1=<hex>" where <hex> is HMAC-SHA256(secret, "<timestamp>." + body).
// Unrecognized comma-separated parts are ignored; both "t" and "v1" must be
// present. NOTE(review): the timestamp is not checked for freshness here —
// this fixture only validates the MAC itself.
func verifyJetmonSignature(signature string, body []byte, secret string) bool {
	var timestamp string
	var got string
	for _, part := range strings.Split(signature, ",") {
		k, v, ok := strings.Cut(part, "=")
		if !ok {
			continue
		}
		switch k {
		case "t":
			timestamp = v
		case "v1":
			got = v
		}
	}
	if timestamp == "" || got == "" {
		return false
	}
	mac := hmac.New(sha256.New, []byte(secret))
	_, _ = mac.Write([]byte(timestamp))
	_, _ = mac.Write([]byte("."))
	_, _ = mac.Write(body)
	want := hex.EncodeToString(mac.Sum(nil))
	// hmac.Equal gives a constant-time comparison of the hex digests.
	return hmac.Equal([]byte(got), []byte(want))
}

// writeFixtureJSON encodes v as JSON onto the response. Encode failures are
// logged but not surfaced to the client, since headers are already written.
func writeFixtureJSON(w http.ResponseWriter, v any) {
	w.Header().Set("Content-Type", "application/json")
	if err := json.NewEncoder(w).Encode(v); err != nil {
		log.Printf("jetmon-testsite: encode json: %v", err)
	}
}

// fixtureDelay parses raw as a time.Duration for the /slow endpoint, falling
// back to fallback on empty, invalid, or negative input, and clamping the
// result to a 30-second ceiling.
func fixtureDelay(raw string, fallback time.Duration) time.Duration {
	if raw == "" {
		return fallback
	}
	delay, err := time.ParseDuration(raw)
	if err != nil || delay < 0 {
		return fallback
	}
	if delay > 30*time.Second {
		return 30 * time.Second
	}
	return delay
}

// selfSignedCert generates a throwaway RSA-2048 self-signed certificate for
// the fixture's HTTPS listener. NotBefore is backdated one hour (clock-skew
// tolerance) and the cert expires after 24 hours; SANs cover localhost,
// api-fixture, jetmon-testsite, and 127.0.0.1.
func selfSignedCert() (tls.Certificate, error) {
	key, err := rsa.GenerateKey(rand.Reader, 2048)
	if err != nil {
		return tls.Certificate{}, err
	}
	// Random 128-bit serial number.
	serial, err := rand.Int(rand.Reader, new(big.Int).Lsh(big.NewInt(1), 128))
	if err != nil {
		return tls.Certificate{}, err
	}
	tmpl := x509.Certificate{
		SerialNumber: serial,
		Subject:      pkix.Name{CommonName: "jetmon-testsite"},
		NotBefore:    time.Now().Add(-time.Hour),
		NotAfter:     time.Now().Add(24 * time.Hour),
		KeyUsage:     x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature,
		ExtKeyUsage:  []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
		DNSNames:     []string{"localhost", "api-fixture", "jetmon-testsite"},
		IPAddresses:  []net.IP{net.ParseIP("127.0.0.1")},
	}
	certDER, err := x509.CreateCertificate(rand.Reader, &tmpl, &tmpl, &key.PublicKey, key)
	if err != nil {
		return tls.Certificate{}, err
	}
	keyDER := x509.MarshalPKCS1PrivateKey(key)
	certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: certDER})
	keyPEM := pem.EncodeToMemory(&pem.Block{Type: "RSA PRIVATE KEY", Bytes: keyDER})
	return tls.X509KeyPair(certPEM, keyPEM)
}

// healthcheck probes the plain-HTTP /health endpoint on loopback; invoked by
// the "healthcheck" CLI mode in main.
// NOTE(review): the HTTP server binds FIXTURE_HTTP_ADDR but this probe reads
// FIXTURE_HEALTH_PORT; if only FIXTURE_HTTP_ADDR is overridden the probe still
// targets :8091 — confirm the env-var split is intentional.
func healthcheck() error {
	client := &http.Client{Timeout: 2 * time.Second}
	resp, err := client.Get("http://127.0.0.1" + envOrDefault("FIXTURE_HEALTH_PORT", ":8091") + "/health")
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("health returned %s", resp.Status)
	}
	return nil
}

// envOrDefault returns the value of environment variable name, or fallback
// when it is unset or empty.
func envOrDefault(name, fallback string) string {
	if v := os.Getenv(name); v != "" {
		return v
	}
	return fallback
}
diff --git a/cmd/jetmon-testsite/main_test.go b/cmd/jetmon-testsite/main_test.go
new file mode 100644
index 00000000..1bdbd6db
--- /dev/null
+++ b/cmd/jetmon-testsite/main_test.go
@@ -0,0 +1,149 @@
package main

import (
	"bytes"
	"crypto/hmac"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"net/http"
"net/http/httptest" + "strings" + "testing" + "time" +) + +func TestFixtureHandlerEndpoints(t *testing.T) { + srv := httptest.NewServer(newFixtureHandler()) + defer srv.Close() + + tests := []struct { + path string + code int + body string + }{ + {path: "/health", code: http.StatusOK, body: "ok"}, + {path: "/ok", code: http.StatusOK, body: "fixture ok"}, + {path: "/keyword", code: http.StatusOK, body: "keyword present"}, + {path: "/status/403", code: http.StatusForbidden, body: "status 403"}, + {path: "/status/500", code: http.StatusInternalServerError, body: "status 500"}, + } + for _, tt := range tests { + t.Run(tt.path, func(t *testing.T) { + resp, err := http.Get(srv.URL + tt.path) + if err != nil { + t.Fatalf("GET %s: %v", tt.path, err) + } + defer resp.Body.Close() + if resp.StatusCode != tt.code { + t.Fatalf("status = %d, want %d", resp.StatusCode, tt.code) + } + buf := make([]byte, 256) + n, _ := resp.Body.Read(buf) + if !strings.Contains(string(buf[:n]), tt.body) { + t.Fatalf("body = %q, want substring %q", string(buf[:n]), tt.body) + } + }) + } +} + +func TestFixtureRedirectAndDelay(t *testing.T) { + srv := httptest.NewServer(newFixtureHandler()) + defer srv.Close() + + client := &http.Client{CheckRedirect: func(*http.Request, []*http.Request) error { + return http.ErrUseLastResponse + }} + resp, err := client.Get(srv.URL + "/redirect") + if err != nil { + t.Fatalf("GET redirect: %v", err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusFound || resp.Header.Get("Location") != "/ok" { + t.Fatalf("redirect status=%d location=%q", resp.StatusCode, resp.Header.Get("Location")) + } + + start := time.Now() + resp, err = http.Get(srv.URL + "/slow?delay=10ms") + if err != nil { + t.Fatalf("GET slow: %v", err) + } + resp.Body.Close() + if elapsed := time.Since(start); elapsed < 10*time.Millisecond { + t.Fatalf("slow endpoint returned too quickly: %s", elapsed) + } +} + +func TestFixtureWebhookReceiverRecordsAndVerifiesSignature(t *testing.T) { + srv := 
httptest.NewServer(newFixtureHandler()) + defer srv.Close() + + secret := "whsec_test_secret" + body := []byte(`{"type":"event.opened"}`) + req, err := http.NewRequest(http.MethodPost, srv.URL+"/webhook?secret="+secret, bytes.NewReader(body)) + if err != nil { + t.Fatalf("new request: %v", err) + } + req.Header.Set("X-Jetmon-Event", "event.opened") + req.Header.Set("X-Jetmon-Delivery", "123") + req.Header.Set("X-Jetmon-Signature", fixtureTestSignature(1700000000, body, secret)) + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("POST webhook: %v", err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusNoContent { + t.Fatalf("POST status = %d, want 204", resp.StatusCode) + } + + resp, err = http.Get(srv.URL + "/webhook/requests") + if err != nil { + t.Fatalf("GET webhook requests: %v", err) + } + defer resp.Body.Close() + var got struct { + Count int `json:"count"` + Requests []struct { + Event string `json:"event"` + Delivery string `json:"delivery"` + SignatureValid *bool `json:"signature_valid"` + Body string `json:"body"` + } `json:"requests"` + } + if err := json.NewDecoder(resp.Body).Decode(&got); err != nil { + t.Fatalf("decode webhook requests: %v", err) + } + if got.Count != 1 || len(got.Requests) != 1 { + t.Fatalf("requests = %+v, want one", got) + } + if got.Requests[0].Event != "event.opened" || got.Requests[0].Delivery != "123" { + t.Fatalf("request headers = %+v", got.Requests[0]) + } + if got.Requests[0].SignatureValid == nil || !*got.Requests[0].SignatureValid { + t.Fatalf("signature_valid = %v, want true", got.Requests[0].SignatureValid) + } + if got.Requests[0].Body != string(body) { + t.Fatalf("body = %q, want %q", got.Requests[0].Body, string(body)) + } + + req, err = http.NewRequest(http.MethodDelete, srv.URL+"/webhook/requests", nil) + if err != nil { + t.Fatalf("new delete request: %v", err) + } + resp, err = http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("DELETE webhook requests: %v", err) + } + 
resp.Body.Close() + if resp.StatusCode != http.StatusNoContent { + t.Fatalf("DELETE status = %d, want 204", resp.StatusCode) + } +} + +func fixtureTestSignature(ts int64, body []byte, secret string) string { + mac := hmac.New(sha256.New, []byte(secret)) + _, _ = mac.Write([]byte(fmt.Sprintf("%d.", ts))) + _, _ = mac.Write(body) + return fmt.Sprintf("t=%d,v1=%s", ts, hex.EncodeToString(mac.Sum(nil))) +} diff --git a/cmd/jetmon2/api_cli.go b/cmd/jetmon2/api_cli.go new file mode 100644 index 00000000..8f73f712 --- /dev/null +++ b/cmd/jetmon2/api_cli.go @@ -0,0 +1,858 @@ +package main + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "flag" + "fmt" + "io" + "net" + "net/http" + "net/url" + "os" + "sort" + "strconv" + "strings" + "text/tabwriter" + "time" +) + +const defaultAPIBaseURL = "http://localhost:8090" +const defaultAPIAuthPolicy = "same-origin" + +type apiCLIOptions struct { + baseURL string + token string + authPolicy string + allowRemote bool + verbose bool + pretty bool + output string + timeout time.Duration + body string + bodyFile string + idempotencyKey string + headers apiHeaderFlags + out io.Writer + errOut io.Writer + in io.Reader + commandName string +} + +type apiHeaderFlags []string + +type apiHTTPResponse struct { + StatusCode int + Status string + Body []byte +} + +type apiCommandInfo struct { + Command string `json:"command"` + Description string `json:"description"` + Example string `json:"example"` +} + +var apiCommandCatalog = []apiCommandInfo{ + {Command: "health", Description: "check API and database health", Example: "jetmon2 api health --pretty"}, + {Command: "me", Description: "show the authenticated API key identity", Example: "jetmon2 api me --pretty"}, + {Command: "request", Description: "send an arbitrary request to an API path", Example: "jetmon2 api request --output table GET /api/v1/sites"}, + {Command: "sites list", Description: "list monitored sites with filters", Example: "jetmon2 api sites list --limit 20 
--output table"}, + {Command: "sites get", Description: "show one monitored site", Example: "jetmon2 api sites get 12345 --pretty"}, + {Command: "sites create", Description: "create a monitored site", Example: "jetmon2 api sites create --blog-id 12345 --url https://example.com --pretty"}, + {Command: "sites update", Description: "update check settings for a site", Example: "jetmon2 api sites update 12345 --url https://example.com/health --pretty"}, + {Command: "sites delete", Description: "delete a monitored site", Example: "jetmon2 api sites delete 12345"}, + {Command: "sites pause", Description: "pause monitoring for a site", Example: "jetmon2 api sites pause 12345 --idempotency-key site-12345-pause"}, + {Command: "sites resume", Description: "resume monitoring for a site", Example: "jetmon2 api sites resume 12345 --idempotency-key site-12345-resume"}, + {Command: "sites trigger-now", Description: "run an immediate check", Example: "jetmon2 api sites trigger-now 12345 --pretty"}, + {Command: "sites bulk-add", Description: "create bounded local test-site batches", Example: "jetmon2 api sites bulk-add --count 3 --batch local-smoke --dry-run --pretty"}, + {Command: "sites cleanup", Description: "delete deterministic CLI-created site batches", Example: "jetmon2 api sites cleanup --batch local-smoke --count 3 --output table"}, + {Command: "sites simulate-failure", Description: "mutate test sites into known failure modes", Example: "jetmon2 api sites simulate-failure --batch local-smoke --mode http-500 --wait 30s --output table"}, + {Command: "events list", Description: "list events for a site", Example: "jetmon2 api events list 12345 --active=true --output table"}, + {Command: "events get", Description: "show one event", Example: "jetmon2 api events get --site-id 12345 98765 --pretty"}, + {Command: "events transitions", Description: "list event transition history", Example: "jetmon2 api events transitions 12345 98765 --output table"}, + {Command: "events close", 
Description: "manually close an event", Example: "jetmon2 api events close 12345 98765 --reason manual_override --pretty"}, + {Command: "webhooks list", Description: "list webhook registrations", Example: "jetmon2 api webhooks list --output table"}, + {Command: "webhooks create", Description: "create a webhook registration", Example: "jetmon2 api webhooks create --url https://receiver.example.test/jetmon --event event.opened --pretty"}, + {Command: "webhooks deliveries", Description: "list webhook delivery rows", Example: "jetmon2 api webhooks deliveries 77 --status failed --output table"}, + {Command: "webhooks retry", Description: "retry an abandoned webhook delivery", Example: "jetmon2 api webhooks retry 77 555 --idempotency-key webhook-77-555-retry --pretty"}, + {Command: "alert-contacts list", Description: "list managed alert contacts", Example: "jetmon2 api alert-contacts list --output table"}, + {Command: "alert-contacts create", Description: "create an email, PagerDuty, Slack, or Teams contact", Example: "jetmon2 api alert-contacts create --label Local --transport email --address alerts@example.test --pretty"}, + {Command: "alert-contacts test", Description: "send a managed alert-contact test", Example: "jetmon2 api alert-contacts test 12 --idempotency-key alert-12-test --pretty"}, + {Command: "alert-contacts deliveries", Description: "list managed alert delivery rows", Example: "jetmon2 api alert-contacts deliveries 12 --status failed --output table"}, + {Command: "smoke", Description: "run the Docker-local API smoke workflow", Example: "jetmon2 api smoke --batch local-smoke --exercise webhook --pretty"}, + {Command: "commands", Description: "list API CLI commands and examples", Example: "jetmon2 api commands --output table"}, +} + +func (h *apiHeaderFlags) String() string { + return strings.Join(*h, ",") +} + +func (h *apiHeaderFlags) Set(v string) error { + if !strings.Contains(v, ":") { + return fmt.Errorf("header %q must be in Name: Value form", v) + } 
+ *h = append(*h, v) + return nil +} + +func cmdAPI(args []string) { + if len(args) == 0 { + printAPIUsage(os.Stderr) + os.Exit(1) + } + + sub := args[0] + rest := args[1:] + var err error + switch sub { + case "health": + err = cmdAPIHealth(rest) + case "me": + err = cmdAPIMe(rest) + case "request": + err = cmdAPIRequest(rest) + case "commands": + err = cmdAPICommands(rest) + case "sites": + err = cmdAPISites(rest) + case "events": + err = cmdAPIEvents(rest) + case "webhooks": + err = cmdAPIWebhooks(rest) + case "alert-contacts": + err = cmdAPIAlertContacts(rest) + case "smoke": + err = cmdAPISmoke(rest) + default: + fmt.Fprintf(os.Stderr, "unknown api subcommand %q (want: health, me, request, commands, sites, events, webhooks, alert-contacts, smoke)\n", sub) + printAPIUsage(os.Stderr) + os.Exit(1) + } + if err != nil { + logAPIErrorAndExit(err) + } +} + +func printAPIUsage(w io.Writer) { + fmt.Fprintln(w, "usage: jetmon2 api [flags]") + fmt.Fprintln(w) + fmt.Fprintln(w, "Run `jetmon2 api commands --output table` for the command catalog.") + fmt.Fprintln(w) + fmt.Fprintln(w, "Environment:") + fmt.Fprintln(w, " JETMON_API_URL API base URL (default: http://localhost:8090)") + fmt.Fprintln(w, " JETMON_API_TOKEN Bearer token for authenticated routes") + fmt.Fprintln(w, " JETMON_API_AUTH_POLICY automatic auth policy: same-origin or any-origin (default: same-origin)") +} + +func cmdAPIHealth(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api health", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return fmt.Errorf("usage: jetmon2 api health [flags]") + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, "/api/v1/health", nil) +} + +func cmdAPIMe(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api me", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return fmt.Errorf("usage: jetmon2 api me 
[flags]") + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, "/api/v1/me", nil) +} + +func cmdAPIRequest(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api request", &opts) + fs.StringVar(&opts.body, "body", "", "literal request body") + fs.StringVar(&opts.bodyFile, "body-file", "", "file containing request body (- reads stdin)") + fs.StringVar(&opts.idempotencyKey, "idempotency-key", "", "Idempotency-Key header for POST retries") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 2 { + return fmt.Errorf("usage: jetmon2 api request [flags] ") + } + + body, err := readAPIRequestBody(opts) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, fs.Arg(0), fs.Arg(1), body) +} + +func cmdAPICommands(args []string) error { + opts := defaultAPIOptions() + opts.output = "table" + fs := newAPIFlagSet("api commands", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return fmt.Errorf("usage: jetmon2 api commands [flags]") + } + return writeAPICommands(opts) +} + +func writeAPICommands(opts apiCLIOptions) error { + return writeAPIValueOutput(opts.out, map[string]any{"commands": apiCommandCatalog}, opts) +} + +func defaultAPIOptions() apiCLIOptions { + return apiCLIOptions{ + baseURL: envOrDefault("JETMON_API_URL", defaultAPIBaseURL), + token: os.Getenv("JETMON_API_TOKEN"), + authPolicy: envOrDefault("JETMON_API_AUTH_POLICY", defaultAPIAuthPolicy), + timeout: 10 * time.Second, + out: os.Stdout, + errOut: os.Stderr, + in: os.Stdin, + } +} + +func newAPIFlagSet(name string, opts *apiCLIOptions) *flag.FlagSet { + opts.commandName = name + fs := flag.NewFlagSet(name, flag.ContinueOnError) + fs.SetOutput(opts.errOut) + fs.StringVar(&opts.baseURL, "base-url", opts.baseURL, "API base URL") + fs.StringVar(&opts.token, "token", opts.token, "Bearer token") + if tokenFlag := fs.Lookup("token"); tokenFlag != nil 
	{
		// Hide the env-derived token default so it never leaks into --help output.
		tokenFlag.DefValue = ""
	}
	fs.StringVar(&opts.authPolicy, "auth-policy", opts.authPolicy, "automatic auth policy: same-origin or any-origin")
	fs.BoolVar(&opts.allowRemote, "allow-remote", opts.allowRemote, "allow writes to a non-local API base URL")
	// -v and --verbose are aliases bound to the same field.
	fs.BoolVar(&opts.verbose, "v", false, "print request and response headers to stderr")
	fs.BoolVar(&opts.verbose, "verbose", false, "print request and response headers to stderr")
	fs.BoolVar(&opts.pretty, "pretty", false, "pretty-print JSON response bodies")
	defaultOutput := opts.output
	if defaultOutput == "" {
		defaultOutput = "json"
	}
	fs.StringVar(&opts.output, "output", defaultOutput, "response output format: json or table")
	fs.DurationVar(&opts.timeout, "timeout", opts.timeout, "request timeout")
	fs.Var(&opts.headers, "header", "additional request header in Name: Value form (repeatable)")
	fs.Usage = func() {
		printAPIFlagUsage(fs.Output(), fs)
	}
	return fs
}

// apiBoolFlag matches flag.Value implementations that report themselves as
// boolean (the stdlib's bool flags implement IsBoolFlag).
type apiBoolFlag interface {
	IsBoolFlag() bool
}

// parseAPIFlags parses args after normalizing them so flags may appear after
// positional arguments, which the stdlib flag package does not allow.
func parseAPIFlags(fs *flag.FlagSet, args []string) error {
	normalized := normalizeAPIFlagArgs(fs, args)
	return fs.Parse(normalized)
}

// normalizeAPIFlagArgs reorders args into "all flags first, then all
// positionals" so invocations like `sites get 12345 --pretty` parse. Rules:
//   - "--" stops flag collection and is re-emitted before the positionals.
//   - A bare "-" is treated as a positional.
//   - A flag unknown to fs is passed through in place so fs.Parse reports it.
//   - A known non-boolean flag consumes the following argument as its value
//     unless the value was attached with "=".
func normalizeAPIFlagArgs(fs *flag.FlagSet, args []string) []string {
	flags := []string{}
	positionals := []string{}
	onlyPositionals := false
	hasTerminator := false
	for i := 0; i < len(args); i++ {
		arg := args[i]
		if onlyPositionals || arg == "-" || !strings.HasPrefix(arg, "-") {
			positionals = append(positionals, arg)
			continue
		}
		if arg == "--" {
			onlyPositionals = true
			hasTerminator = true
			continue
		}

		name, hasValue := apiFlagName(arg)
		f := fs.Lookup(name)
		if f == nil {
			flags = append(flags, arg)
			continue
		}
		flags = append(flags, arg)
		if hasValue || apiFlagIsBool(f) {
			continue
		}
		if i+1 < len(args) {
			i++
			flags = append(flags, args[i])
		}
	}
	if hasTerminator {
		flags = append(flags, "--")
	}
	return append(flags, positionals...)
+} + +func apiFlagName(arg string) (string, bool) { + name := strings.TrimLeft(arg, "-") + if idx := strings.IndexByte(name, '='); idx >= 0 { + return name[:idx], true + } + return name, false +} + +func apiFlagIsBool(f *flag.Flag) bool { + bf, ok := f.Value.(apiBoolFlag) + return ok && bf.IsBoolFlag() +} + +func printAPIFlagUsage(w io.Writer, fs *flag.FlagSet) { + fmt.Fprintf(w, "Usage of %s:\n", fs.Name()) + printAPIFlagDefaults(w, fs) +} + +func printAPIFlagDefaults(w io.Writer, fs *flag.FlagSet) { + flags := []*flag.Flag{} + fs.VisitAll(func(f *flag.Flag) { + flags = append(flags, f) + }) + sort.Slice(flags, func(i, j int) bool { + return flags[i].Name < flags[j].Name + }) + + for _, f := range flags { + valueName, usage := flag.UnquoteUsage(f) + prefix := "--" + if len(f.Name) == 1 { + prefix = "-" + } + fmt.Fprintf(w, " %s%s", prefix, f.Name) + if valueName != "" { + fmt.Fprintf(w, " %s", valueName) + } + fmt.Fprintf(w, "\n \t%s", usage) + if defaultValue := apiFlagDefaultValue(f, valueName); defaultValue != "" { + fmt.Fprintf(w, " (default %s)", defaultValue) + } + fmt.Fprintln(w) + } +} + +func apiFlagDefaultValue(f *flag.Flag, valueName string) string { + if f.DefValue == "" || f.DefValue == "0" || f.DefValue == "0s" || f.DefValue == "false" { + return "" + } + if valueName == "string" { + return strconv.Quote(f.DefValue) + } + return f.DefValue +} + +func readAPIRequestBody(opts apiCLIOptions) ([]byte, error) { + if opts.body != "" && opts.bodyFile != "" { + return nil, errors.New("use --body or --body-file, not both") + } + if opts.body != "" { + return []byte(opts.body), nil + } + if opts.bodyFile == "" { + return nil, nil + } + if opts.bodyFile == "-" { + return io.ReadAll(opts.in) + } + return os.ReadFile(opts.bodyFile) +} + +func executeAPIRequest(ctx context.Context, client *http.Client, opts apiCLIOptions, method, target string, body []byte) error { + if opts.out == nil { + opts.out = io.Discard + } + if err := validateAPIOutputFormat(opts.output); 
err != nil { + return err + } + resp, err := doAPIRequest(ctx, client, opts, method, target, body) + if err != nil { + return err + } + if err := writeAPIOutput(opts.out, resp.Body, opts); err != nil { + return err + } + if resp.StatusCode >= 400 { + return fmt.Errorf("api returned %s", resp.Status) + } + return nil +} + +func doAPIRequest(ctx context.Context, client *http.Client, opts apiCLIOptions, method, target string, body []byte) (apiHTTPResponse, error) { + if opts.errOut == nil { + opts.errOut = io.Discard + } + if opts.timeout <= 0 { + opts.timeout = 10 * time.Second + } + if client == nil { + client = &http.Client{Timeout: opts.timeout} + } + + requestURL, err := apiRequestURL(opts.baseURL, target) + if err != nil { + return apiHTTPResponse{}, err + } + if apiMethodRequiresRemoteWriteGuard(method) { + if _, err := requireAPILocalURLOrAllowRemote(requestURL, opts.allowRemote, apiRemoteGuardCommand(opts)); err != nil { + return apiHTTPResponse{}, err + } + } + + var bodyReader io.Reader + if len(body) > 0 { + bodyReader = bytes.NewReader(body) + } + req, err := http.NewRequestWithContext(ctx, strings.ToUpper(method), requestURL, bodyReader) + if err != nil { + return apiHTTPResponse{}, err + } + sendAutoAuth, err := shouldSendAPIAutoAuth(opts.baseURL, requestURL, opts.authPolicy) + if err != nil { + return apiHTTPResponse{}, err + } + applyAPIRequestHeaders(req, opts, len(body) > 0, sendAutoAuth) + + if opts.verbose { + writeAPIRequestHeaders(opts.errOut, req) + } + + resp, err := client.Do(req) + if err != nil { + return apiHTTPResponse{}, err + } + defer resp.Body.Close() + + if opts.verbose { + writeAPIResponseHeaders(opts.errOut, resp) + } + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return apiHTTPResponse{}, err + } + return apiHTTPResponse{ + StatusCode: resp.StatusCode, + Status: resp.Status, + Body: respBody, + }, nil +} + +func apiRequestURL(baseURL, target string) (string, error) { + if strings.TrimSpace(target) == "" { + return 
"", errors.New("request path is required") + } + if u, err := url.Parse(target); err == nil && u.IsAbs() { + return u.String(), nil + } + + base, err := url.Parse(strings.TrimRight(baseURL, "/")) + if err != nil { + return "", fmt.Errorf("invalid API base URL %q: %w", baseURL, err) + } + if !base.IsAbs() || base.Host == "" { + return "", fmt.Errorf("invalid API base URL %q: must include scheme and host", baseURL) + } + rel, err := url.Parse(target) + if err != nil { + return "", fmt.Errorf("invalid API path %q: %w", target, err) + } + if !strings.HasPrefix(rel.Path, "/") { + rel.Path = "/" + rel.Path + } + return base.ResolveReference(rel).String(), nil +} + +func shouldSendAPIAutoAuth(baseURL, requestURL, policy string) (bool, error) { + policy = strings.ToLower(strings.TrimSpace(policy)) + if policy == "" { + policy = defaultAPIAuthPolicy + } + switch policy { + case "any-origin": + return true, nil + case "same-origin": + base, err := url.Parse(strings.TrimRight(baseURL, "/")) + if err != nil { + return false, fmt.Errorf("invalid API base URL %q: %w", baseURL, err) + } + target, err := url.Parse(requestURL) + if err != nil { + return false, fmt.Errorf("invalid request URL %q: %w", requestURL, err) + } + return sameAPIOrigin(base, target), nil + default: + return false, fmt.Errorf("invalid auth policy %q (want: same-origin or any-origin)", policy) + } +} + +func sameAPIOrigin(a, b *url.URL) bool { + if a == nil || b == nil { + return false + } + return strings.EqualFold(a.Scheme, b.Scheme) && strings.EqualFold(a.Host, b.Host) +} + +func apiMethodRequiresRemoteWriteGuard(method string) bool { + switch strings.ToUpper(strings.TrimSpace(method)) { + case http.MethodPost, http.MethodPut, http.MethodPatch, http.MethodDelete: + return true + default: + return false + } +} + +func apiRemoteGuardCommand(opts apiCLIOptions) string { + if strings.TrimSpace(opts.commandName) != "" { + return strings.TrimSpace(opts.commandName) + } + return "api" +} + +func 
requireAPILocalOrAllowRemote(opts apiCLIOptions, allowRemote bool, command string) (bool, error) {
	// Convenience wrapper that applies the remote-write guard to the
	// configured base URL from opts.
	return requireAPILocalURLOrAllowRemote(opts.baseURL, allowRemote, command)
}

// requireAPILocalURLOrAllowRemote returns (remote, err): remote is true when
// rawURL does not point at localhost or a loopback IP, and err is non-nil
// when the URL is remote and allowRemote is false, or when the URL itself is
// invalid. command is used only to attribute the refusal message.
func requireAPILocalURLOrAllowRemote(rawURL string, allowRemote bool, command string) (bool, error) {
	local, err := isLocalAPIURL(rawURL)
	if err != nil {
		return false, err
	}
	if local {
		return false, nil
	}
	if allowRemote {
		return true, nil
	}
	return true, fmt.Errorf("%s refuses to modify non-local API URL %q without --allow-remote (local means localhost or loopback IP)", command, rawURL)
}

// isLocalAPIURL reports whether rawURL targets "localhost", a "*.localhost"
// name, or a loopback IP. The URL must be absolute with a host; a trailing
// dot on the hostname is ignored and comparison is case-insensitive.
func isLocalAPIURL(rawURL string) (bool, error) {
	u, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil {
		return false, fmt.Errorf("invalid API URL %q: %w", rawURL, err)
	}
	if !u.IsAbs() || u.Host == "" {
		return false, fmt.Errorf("invalid API URL %q: must include scheme and host", rawURL)
	}
	host := strings.ToLower(strings.TrimSuffix(u.Hostname(), "."))
	if host == "localhost" || strings.HasSuffix(host, ".localhost") {
		return true, nil
	}
	ip := net.ParseIP(host)
	return ip != nil && ip.IsLoopback(), nil
}

// applyAPIRequestHeaders sets the standard headers on req: JSON Accept (plus
// Content-Type when a body is present), then — only when sendAutoAuth allows —
// the bearer token and Idempotency-Key, and finally any explicit --header
// values, which override anything set before them. Malformed --header entries
// (no colon) are silently skipped.
func applyAPIRequestHeaders(req *http.Request, opts apiCLIOptions, hasBody bool, sendAutoAuth bool) {
	req.Header.Set("Accept", "application/json")
	if hasBody {
		req.Header.Set("Content-Type", "application/json")
	}
	if sendAutoAuth && strings.TrimSpace(opts.token) != "" {
		req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(opts.token))
	}
	if sendAutoAuth && strings.TrimSpace(opts.idempotencyKey) != "" {
		req.Header.Set("Idempotency-Key", strings.TrimSpace(opts.idempotencyKey))
	}
	for _, raw := range opts.headers {
		name, value, ok := strings.Cut(raw, ":")
		if !ok {
			continue
		}
		req.Header.Set(strings.TrimSpace(name), strings.TrimSpace(value))
	}
}

// writeAPIRequestHeaders prints a curl-style ("> ") dump of the outgoing
// request line and headers to w; sensitive header values are redacted by
// writeSortedHeaders.
func writeAPIRequestHeaders(w io.Writer, req *http.Request) {
	path := req.URL.RequestURI()
	if path == "" {
		path = "/"
	}
	fmt.Fprintf(w, "> %s %s %s\n",
req.Method, path, req.Proto) + fmt.Fprintf(w, "> Host: %s\n", req.URL.Host) + writeSortedHeaders(w, "> ", req.Header) + fmt.Fprintln(w, ">") +} + +func writeAPIResponseHeaders(w io.Writer, resp *http.Response) { + fmt.Fprintf(w, "< %s %s\n", resp.Proto, resp.Status) + writeSortedHeaders(w, "< ", resp.Header) + fmt.Fprintln(w, "<") +} + +func writeSortedHeaders(w io.Writer, prefix string, h http.Header) { + keys := make([]string, 0, len(h)) + for k := range h { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + for _, v := range h.Values(k) { + if isSensitiveAPIHeader(k) { + v = "[redacted]" + } + fmt.Fprintf(w, "%s%s: %s\n", prefix, k, v) + } + } +} + +func isSensitiveAPIHeader(name string) bool { + switch strings.ToLower(strings.TrimSpace(name)) { + case "authorization", "proxy-authorization", "idempotency-key", "cookie", "set-cookie", "x-api-key": + return true + default: + return false + } +} + +func writeAPIResponseBody(w io.Writer, body []byte, pretty bool) error { + body = bytes.TrimSpace(body) + if len(body) == 0 { + return nil + } + if pretty && json.Valid(body) { + var formatted bytes.Buffer + if err := json.Indent(&formatted, body, "", " "); err != nil { + return err + } + body = formatted.Bytes() + } + if _, err := w.Write(body); err != nil { + return err + } + if !bytes.HasSuffix(body, []byte("\n")) { + _, err := fmt.Fprintln(w) + return err + } + return nil +} + +func writeAPIValueOutput(w io.Writer, value any, opts apiCLIOptions) error { + if w == nil { + w = io.Discard + } + body, err := json.Marshal(value) + if err != nil { + return err + } + return writeAPIOutput(w, body, opts) +} + +func writeAPIOutput(w io.Writer, body []byte, opts apiCLIOptions) error { + if err := validateAPIOutputFormat(opts.output); err != nil { + return err + } + switch opts.output { + case "", "json": + return writeAPIResponseBody(w, body, opts.pretty) + case "table": + return writeAPIResponseTable(w, body) + } + return nil +} + +func 
validateAPIOutputFormat(output string) error { + switch output { + case "", "json", "table": + return nil + default: + return fmt.Errorf("output must be one of: json, table") + } +} + +func writeAPIResponseTable(w io.Writer, body []byte) error { + body = bytes.TrimSpace(body) + if len(body) == 0 { + return nil + } + var value any + if err := json.Unmarshal(body, &value); err != nil { + return err + } + rows := apiTableRows(value) + if len(rows) == 0 { + _, err := fmt.Fprintln(w, "no rows") + return err + } + columns := apiTableColumns(rows) + tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0) + for i, col := range columns { + if i > 0 { + fmt.Fprint(tw, "\t") + } + fmt.Fprint(tw, col) + } + fmt.Fprintln(tw) + for _, row := range rows { + for i, col := range columns { + if i > 0 { + fmt.Fprint(tw, "\t") + } + fmt.Fprint(tw, apiTableValue(row[col])) + } + fmt.Fprintln(tw) + } + return tw.Flush() +} + +func apiTableRows(value any) []map[string]any { + switch v := value.(type) { + case map[string]any: + if rows := apiWorkflowTableRows(v); len(rows) > 0 { + return rows + } + for _, key := range []string{"data", "created", "sites", "steps", "commands"} { + if data, ok := v[key].([]any); ok { + return apiRowsFromArray(data) + } + } + return []map[string]any{v} + case []any: + return apiRowsFromArray(v) + default: + return nil + } +} + +func apiWorkflowTableRows(value map[string]any) []map[string]any { + steps, ok := value["steps"].([]any) + if !ok { + return nil + } + rows := make([]map[string]any, 0, len(steps)) + for _, item := range steps { + step, ok := item.(map[string]any) + if !ok { + continue + } + row := map[string]any{"kind": "step"} + for k, v := range step { + row[k] = v + } + rows = append(rows, row) + } + cleanupResults, _ := value["cleanup_results"].([]any) + for _, item := range cleanupResults { + cleanup, ok := item.(map[string]any) + if !ok { + continue + } + row := map[string]any{ + "kind": "cleanup", + "name": cleanup["resource"], + "id": cleanup["id"], + 
"status": cleanup["status"], + } + if errText, ok := cleanup["error"]; ok { + row["detail"] = errText + } + rows = append(rows, row) + } + return rows +} + +func apiRowsFromArray(data []any) []map[string]any { + rows := make([]map[string]any, 0, len(data)) + for _, item := range data { + row, ok := item.(map[string]any) + if !ok { + continue + } + rows = append(rows, row) + } + return rows +} + +func apiTableColumns(rows []map[string]any) []string { + best := []string{} + for _, cols := range [][]string{ + {"id", "blog_id", "monitor_url", "monitor_active", "current_state", "current_severity", "active_event_id"}, + {"blog_id", "monitor_url", "monitor_active", "request_method", "detection_profile", "check_keyword", "redirect_policy", "timeout_seconds"}, + {"id", "site_id", "check_type", "state", "severity", "started_at", "ended_at"}, + {"id", "url", "active", "events", "secret_preview", "created_at"}, + {"id", "label", "active", "transport", "min_severity", "max_per_hour", "destination_preview"}, + {"id", "status", "attempt", "event_id", "event_type", "last_status_code", "created_at"}, + {"site_id", "status", "error"}, + {"site_id", "action", "trigger_status", "event_ids", "event_states", "event_severities", "transition_count", "note", "error"}, + {"site_id", "action", "note", "error"}, + {"kind", "name", "id", "status", "detail"}, + {"command", "description", "example"}, + {"name", "status", "detail"}, + } { + present := apiColumnsPresent(rows, cols) + if len(present) > len(best) { + best = present + } + } + if len(best) > 0 { + return best + } + seen := map[string]struct{}{} + for _, row := range rows { + for k := range row { + seen[k] = struct{}{} + } + } + cols := make([]string, 0, len(seen)) + for k := range seen { + cols = append(cols, k) + } + sort.Strings(cols) + return cols +} + +func apiColumnsPresent(rows []map[string]any, cols []string) []string { + out := []string{} + for _, col := range cols { + for _, row := range rows { + if _, ok := row[col]; ok { + 
out = append(out, col) + break + } + } + } + return out +} + +func apiTableValue(v any) string { + switch value := v.(type) { + case nil: + return "" + case string: + return value + case bool: + return fmt.Sprintf("%t", value) + case float64: + if value == float64(int64(value)) { + return fmt.Sprintf("%d", int64(value)) + } + return fmt.Sprintf("%g", value) + case []any: + parts := make([]string, 0, len(value)) + for _, item := range value { + parts = append(parts, apiTableValue(item)) + } + return strings.Join(parts, ",") + default: + b, err := json.Marshal(value) + if err != nil { + return fmt.Sprint(value) + } + return string(b) + } +} + +func logAPIErrorAndExit(err error) { + if errors.Is(err, flag.ErrHelp) { + os.Exit(0) + } + fmt.Fprintf(os.Stderr, "api: %v\n", err) + os.Exit(1) +} diff --git a/cmd/jetmon2/api_cli_alert_contacts.go b/cmd/jetmon2/api_cli_alert_contacts.go new file mode 100644 index 00000000..8dea9822 --- /dev/null +++ b/cmd/jetmon2/api_cli_alert_contacts.go @@ -0,0 +1,405 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "flag" + "fmt" + "net/http" + "net/url" + "strconv" + "strings" +) + +type apiAlertContactCreateOptions struct { + label string + active apiOptionalBoolFlag + transport string + destination apiAlertDestinationOptions + siteIDs apiInt64SliceFlags + minSeverity apiOptionalStringFlag + maxPerHour apiOptionalIntFlag +} + +type apiAlertContactUpdateOptions struct { + label apiOptionalStringFlag + active apiOptionalBoolFlag + destination apiAlertDestinationOptions + siteIDs apiInt64SliceFlags + clearSites bool + minSeverity apiOptionalStringFlag + maxPerHour apiOptionalIntFlag +} + +type apiAlertDestinationOptions struct { + raw string + address string + integrationKey string + webhookURL string +} + +type apiAlertDeliveriesFilters struct { + cursor string + limit int + status string +} + +type apiAlertContactSiteFilter struct { + SiteIDs []int64 `json:"site_ids,omitempty"` +} + +type 
apiAlertContactCreateRequest struct { + Label string `json:"label"` + Active *bool `json:"active,omitempty"` + Transport string `json:"transport"` + Destination json.RawMessage `json:"destination"` + SiteFilter apiAlertContactSiteFilter `json:"site_filter"` + MinSeverity *string `json:"min_severity,omitempty"` + MaxPerHour *int `json:"max_per_hour,omitempty"` +} + +type apiAlertContactUpdateRequest struct { + Label *string `json:"label,omitempty"` + Active *bool `json:"active,omitempty"` + Destination json.RawMessage `json:"destination,omitempty"` + SiteFilter *apiAlertContactSiteFilter `json:"site_filter,omitempty"` + MinSeverity *string `json:"min_severity,omitempty"` + MaxPerHour *int `json:"max_per_hour,omitempty"` +} + +func cmdAPIAlertContacts(args []string) error { + if len(args) == 0 { + return errors.New("usage: jetmon2 api alert-contacts [flags]") + } + + sub := args[0] + rest := args[1:] + switch sub { + case "list": + return cmdAPIAlertContactsList(rest) + case "get": + return cmdAPIAlertContactsGet(rest) + case "create": + return cmdAPIAlertContactsCreate(rest) + case "update": + return cmdAPIAlertContactsUpdate(rest) + case "delete": + return cmdAPIAlertContactsDelete(rest) + case "test": + return cmdAPIAlertContactsTest(rest) + case "deliveries": + return cmdAPIAlertContactsDeliveries(rest) + case "retry": + return cmdAPIAlertContactsRetry(rest) + default: + return fmt.Errorf("unknown api alert-contacts subcommand %q (want: list, get, create, update, delete, test, deliveries, retry)", sub) + } +} + +func cmdAPIAlertContactsList(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api alert-contacts list", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api alert-contacts list [flags]") + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, "/api/v1/alert-contacts", nil) +} + +func cmdAPIAlertContactsGet(args []string) 
error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api alert-contacts get", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api alert-contacts get [flags] ") + } + target, err := apiAlertContactPath(fs.Arg(0), "") + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPIAlertContactsCreate(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api alert-contacts create", &opts) + addAPIIdempotencyFlag(fs, &opts) + create := apiAlertContactCreateOptions{} + fs.StringVar(&create.label, "label", "", "alert contact label") + fs.Var(&create.active, "active", "alert contact enabled: true or false") + fs.StringVar(&create.transport, "transport", "", "transport: email, pagerduty, slack, or teams") + addAPIAlertDestinationFlags(fs, &create.destination) + fs.Var(&create.siteIDs, "site-id", "site id filter (repeatable or comma-separated)") + fs.Var(&create.minSeverity, "min-severity", "minimum severity: Up, Warning, Degraded, SeemsDown, or Down") + fs.Var(&create.maxPerHour, "max-per-hour", "maximum notifications per hour, 0 for unlimited") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api alert-contacts create [flags]") + } + body, err := marshalAPIAlertContactCreateBody(create) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, "/api/v1/alert-contacts", body) +} + +func cmdAPIAlertContactsUpdate(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api alert-contacts update", &opts) + update := apiAlertContactUpdateOptions{} + fs.Var(&update.label, "label", "alert contact label") + fs.Var(&update.active, "active", "alert contact enabled: true or false") + addAPIAlertDestinationFlags(fs, &update.destination) + 
fs.Var(&update.siteIDs, "site-id", "site id filter (repeatable or comma-separated)") + fs.BoolVar(&update.clearSites, "clear-sites", false, "clear site filters") + fs.Var(&update.minSeverity, "min-severity", "minimum severity: Up, Warning, Degraded, SeemsDown, or Down") + fs.Var(&update.maxPerHour, "max-per-hour", "maximum notifications per hour, 0 for unlimited") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api alert-contacts update [flags] ") + } + target, err := apiAlertContactPath(fs.Arg(0), "") + if err != nil { + return err + } + body, err := marshalAPIAlertContactUpdateBody(update) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPatch, target, body) +} + +func cmdAPIAlertContactsDelete(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api alert-contacts delete", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api alert-contacts delete [flags] ") + } + target, err := apiAlertContactPath(fs.Arg(0), "") + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodDelete, target, nil) +} + +func cmdAPIAlertContactsTest(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api alert-contacts test", &opts) + addAPIIdempotencyFlag(fs, &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api alert-contacts test [flags] ") + } + target, err := apiAlertContactPath(fs.Arg(0), "test") + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, target, nil) +} + +func cmdAPIAlertContactsDeliveries(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api alert-contacts deliveries", &opts) + filters := 
apiAlertDeliveriesFilters{} + fs.StringVar(&filters.cursor, "cursor", "", "pagination cursor") + fs.IntVar(&filters.limit, "limit", 0, "page size (1-200)") + fs.StringVar(&filters.status, "status", "", "delivery status: pending, delivered, failed, or abandoned") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api alert-contacts deliveries [flags] ") + } + target, err := apiAlertContactDeliveriesPath(fs.Arg(0), filters) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPIAlertContactsRetry(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api alert-contacts retry", &opts) + addAPIIdempotencyFlag(fs, &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 2 { + return errors.New("usage: jetmon2 api alert-contacts retry [flags] ") + } + target, err := apiAlertContactRetryPath(fs.Arg(0), fs.Arg(1)) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, target, nil) +} + +func addAPIAlertDestinationFlags(fs *flag.FlagSet, dest *apiAlertDestinationOptions) { + fs.StringVar(&dest.raw, "destination", "", "raw destination JSON") + fs.StringVar(&dest.address, "address", "", "email destination address") + fs.StringVar(&dest.integrationKey, "integration-key", "", "PagerDuty Events API v2 integration key") + fs.StringVar(&dest.webhookURL, "webhook-url", "", "Slack or Teams incoming webhook URL") +} + +func apiAlertContactPath(rawID, suffix string) (string, error) { + id, err := apiPositiveID(rawID, "alert contact") + if err != nil { + return "", err + } + path := "/api/v1/alert-contacts/" + strconv.FormatInt(id, 10) + if suffix != "" { + path += "/" + strings.TrimPrefix(suffix, "/") + } + return path, nil +} + +func apiAlertContactDeliveriesPath(rawID string, filters apiAlertDeliveriesFilters) 
(string, error) { + path, err := apiAlertContactPath(rawID, "deliveries") + if err != nil { + return "", err + } + if filters.limit < 0 { + return "", errors.New("limit must be positive") + } + + values := url.Values{} + if filters.cursor != "" { + values.Set("cursor", filters.cursor) + } + if filters.limit > 0 { + values.Set("limit", strconv.Itoa(filters.limit)) + } + if filters.status != "" { + switch filters.status { + case "pending", "delivered", "failed", "abandoned": + values.Set("status", filters.status) + default: + return "", errors.New("status must be one of: pending, delivered, failed, abandoned") + } + } + if len(values) == 0 { + return path, nil + } + return path + "?" + values.Encode(), nil +} + +func apiAlertContactRetryPath(rawContactID, rawDeliveryID string) (string, error) { + contactID, err := apiPositiveID(rawContactID, "alert contact") + if err != nil { + return "", err + } + deliveryID, err := apiPositiveID(rawDeliveryID, "delivery") + if err != nil { + return "", err + } + return fmt.Sprintf("/api/v1/alert-contacts/%d/deliveries/%d/retry", contactID, deliveryID), nil +} + +func marshalAPIAlertContactCreateBody(opts apiAlertContactCreateOptions) ([]byte, error) { + if strings.TrimSpace(opts.label) == "" { + return nil, errors.New("label is required") + } + if strings.TrimSpace(opts.transport) == "" { + return nil, errors.New("transport is required") + } + destination, err := opts.destination.rawForTransport(opts.transport, true) + if err != nil { + return nil, err + } + req := apiAlertContactCreateRequest{ + Label: opts.label, + Active: opts.active.ptr(), + Transport: opts.transport, + Destination: destination, + SiteFilter: apiAlertContactSiteFilter{SiteIDs: opts.siteIDs.valuesOrEmpty()}, + MinSeverity: opts.minSeverity.ptr(), + MaxPerHour: opts.maxPerHour.ptr(), + } + return json.Marshal(req) +} + +func marshalAPIAlertContactUpdateBody(opts apiAlertContactUpdateOptions) ([]byte, error) { + if opts.clearSites && opts.siteIDs.set { + return 
nil, errors.New("use --site-id or --clear-sites, not both") + } + destination, err := opts.destination.rawForTransport("", false) + if err != nil { + return nil, err + } + req := apiAlertContactUpdateRequest{ + Label: opts.label.ptr(), + Active: opts.active.ptr(), + Destination: destination, + MinSeverity: opts.minSeverity.ptr(), + MaxPerHour: opts.maxPerHour.ptr(), + } + if opts.siteIDs.set || opts.clearSites { + req.SiteFilter = &apiAlertContactSiteFilter{SiteIDs: opts.siteIDs.valuesOrEmpty()} + } + return json.Marshal(req) +} + +func (opts apiAlertDestinationOptions) rawForTransport(transport string, required bool) (json.RawMessage, error) { + set := 0 + for _, v := range []string{opts.raw, opts.address, opts.integrationKey, opts.webhookURL} { + if strings.TrimSpace(v) != "" { + set++ + } + } + if set == 0 { + if required { + return nil, errors.New("destination is required") + } + return nil, nil + } + if set > 1 { + return nil, errors.New("use only one destination flag") + } + if opts.raw != "" { + if !json.Valid([]byte(opts.raw)) { + return nil, errors.New("destination must be valid JSON") + } + return json.RawMessage(opts.raw), nil + } + + var value any + switch { + case opts.address != "": + if transport != "" && transport != "email" { + return nil, errors.New("--address requires --transport email") + } + value = map[string]string{"address": opts.address} + case opts.integrationKey != "": + if transport != "" && transport != "pagerduty" { + return nil, errors.New("--integration-key requires --transport pagerduty") + } + value = map[string]string{"integration_key": opts.integrationKey} + case opts.webhookURL != "": + if transport != "" && transport != "slack" && transport != "teams" { + return nil, errors.New("--webhook-url requires --transport slack or teams") + } + value = map[string]string{"webhook_url": opts.webhookURL} + default: + return nil, errors.New("destination is required") + } + + b, err := json.Marshal(value) + if err != nil { + return nil, err 
+ } + return json.RawMessage(b), nil +} diff --git a/cmd/jetmon2/api_cli_alert_contacts_test.go b/cmd/jetmon2/api_cli_alert_contacts_test.go new file mode 100644 index 00000000..eb0be12e --- /dev/null +++ b/cmd/jetmon2/api_cli_alert_contacts_test.go @@ -0,0 +1,196 @@ +package main + +import ( + "encoding/json" + "net/url" + "testing" +) + +func TestMarshalAPIAlertContactCreateBody(t *testing.T) { + var active apiOptionalBoolFlag + setTestFlag(t, &active, "false") + var siteIDs apiInt64SliceFlags + setTestFlag(t, &siteIDs, "42,99") + var minSeverity apiOptionalStringFlag + setTestFlag(t, &minSeverity, "Warning") + var maxPerHour apiOptionalIntFlag + setTestFlag(t, &maxPerHour, "0") + + body, err := marshalAPIAlertContactCreateBody(apiAlertContactCreateOptions{ + label: "ops-email", + active: active, + transport: "email", + destination: apiAlertDestinationOptions{address: "ops@example.com"}, + siteIDs: siteIDs, + minSeverity: minSeverity, + maxPerHour: maxPerHour, + }) + if err != nil { + t.Fatalf("marshalAPIAlertContactCreateBody() error = %v", err) + } + var got map[string]any + if err := json.Unmarshal(body, &got); err != nil { + t.Fatalf("unmarshal body: %v", err) + } + if got["label"] != "ops-email" { + t.Fatalf("label = %#v", got["label"]) + } + if got["active"] != false { + t.Fatalf("active = %#v, want false", got["active"]) + } + if got["transport"] != "email" { + t.Fatalf("transport = %#v, want email", got["transport"]) + } + dest := got["destination"].(map[string]any) + if dest["address"] != "ops@example.com" { + t.Fatalf("destination.address = %#v", dest["address"]) + } + siteFilter := got["site_filter"].(map[string]any) + assertNumberArray(t, siteFilter["site_ids"], []int64{42, 99}) + if got["min_severity"] != "Warning" { + t.Fatalf("min_severity = %#v, want Warning", got["min_severity"]) + } + if got["max_per_hour"] != float64(0) { + t.Fatalf("max_per_hour = %#v, want 0", got["max_per_hour"]) + } +} + +func 
TestMarshalAPIAlertContactCreateBodyBuildsTransportDestinations(t *testing.T) { + tests := []struct { + name string + transport string + destination apiAlertDestinationOptions + wantKey string + wantValue string + }{ + { + name: "pagerduty", + transport: "pagerduty", + destination: apiAlertDestinationOptions{integrationKey: "pd-key"}, + wantKey: "integration_key", + wantValue: "pd-key", + }, + { + name: "slack", + transport: "slack", + destination: apiAlertDestinationOptions{webhookURL: "https://hooks.slack.com/services/test"}, + wantKey: "webhook_url", + wantValue: "https://hooks.slack.com/services/test", + }, + { + name: "teams", + transport: "teams", + destination: apiAlertDestinationOptions{webhookURL: "https://outlook.office.com/webhook/test"}, + wantKey: "webhook_url", + wantValue: "https://outlook.office.com/webhook/test", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + body, err := marshalAPIAlertContactCreateBody(apiAlertContactCreateOptions{ + label: tt.name, + transport: tt.transport, + destination: tt.destination, + }) + if err != nil { + t.Fatalf("marshalAPIAlertContactCreateBody() error = %v", err) + } + var got map[string]any + if err := json.Unmarshal(body, &got); err != nil { + t.Fatalf("unmarshal body: %v", err) + } + dest := got["destination"].(map[string]any) + if dest[tt.wantKey] != tt.wantValue { + t.Fatalf("destination[%s] = %#v, want %q", tt.wantKey, dest[tt.wantKey], tt.wantValue) + } + }) + } +} + +func TestMarshalAPIAlertContactUpdateBodySupportsDestinationAndClearSites(t *testing.T) { + var label apiOptionalStringFlag + setTestFlag(t, &label, "platform-oncall") + + body, err := marshalAPIAlertContactUpdateBody(apiAlertContactUpdateOptions{ + label: label, + destination: apiAlertDestinationOptions{raw: `{"webhook_url":"https://example.com/hook"}`}, + clearSites: true, + }) + if err != nil { + t.Fatalf("marshalAPIAlertContactUpdateBody() error = %v", err) + } + var got map[string]any + if err := 
json.Unmarshal(body, &got); err != nil { + t.Fatalf("unmarshal body: %v", err) + } + if got["label"] != "platform-oncall" { + t.Fatalf("label = %#v", got["label"]) + } + dest := got["destination"].(map[string]any) + if dest["webhook_url"] != "https://example.com/hook" { + t.Fatalf("destination.webhook_url = %#v", dest["webhook_url"]) + } + if _, ok := got["site_filter"].(map[string]any)["site_ids"]; ok { + t.Fatalf("site_ids present in cleared site_filter: %#v", got["site_filter"]) + } +} + +func TestMarshalAPIAlertContactUpdateBodyRejectsConflicts(t *testing.T) { + var siteIDs apiInt64SliceFlags + setTestFlag(t, &siteIDs, "42") + if _, err := marshalAPIAlertContactUpdateBody(apiAlertContactUpdateOptions{siteIDs: siteIDs, clearSites: true}); err == nil { + t.Fatal("site filter conflict error = nil, want error") + } + if _, err := (apiAlertDestinationOptions{raw: `{}`, address: "ops@example.com"}).rawForTransport("", false); err == nil { + t.Fatal("destination conflict error = nil, want error") + } + if _, err := (apiAlertDestinationOptions{raw: `{not-json}`}).rawForTransport("", false); err == nil { + t.Fatal("invalid raw destination error = nil, want error") + } +} + +func TestAPIAlertContactPaths(t *testing.T) { + got, err := apiAlertContactPath("17", "test") + if err != nil { + t.Fatalf("apiAlertContactPath() error = %v", err) + } + if got != "/api/v1/alert-contacts/17/test" { + t.Fatalf("path = %q, want test path", got) + } + + got, err = apiAlertContactRetryPath("17", "88") + if err != nil { + t.Fatalf("apiAlertContactRetryPath() error = %v", err) + } + if got != "/api/v1/alert-contacts/17/deliveries/88/retry" { + t.Fatalf("retry path = %q, want delivery retry path", got) + } +} + +func TestAPIAlertContactDeliveriesPath(t *testing.T) { + got, err := apiAlertContactDeliveriesPath("17", apiAlertDeliveriesFilters{ + cursor: "cur-5", + limit: 50, + status: "failed", + }) + if err != nil { + t.Fatalf("apiAlertContactDeliveriesPath() error = %v", err) + } + u, err 
:= url.Parse(got) + if err != nil { + t.Fatalf("parse path: %v", err) + } + if u.Path != "/api/v1/alert-contacts/17/deliveries" { + t.Fatalf("path = %q, want deliveries path", u.Path) + } + for key, want := range map[string]string{ + "cursor": "cur-5", + "limit": "50", + "status": "failed", + } { + if got := u.Query().Get(key); got != want { + t.Fatalf("query %s = %q, want %q", key, got, want) + } + } +} diff --git a/cmd/jetmon2/api_cli_events.go b/cmd/jetmon2/api_cli_events.go new file mode 100644 index 00000000..3f66a682 --- /dev/null +++ b/cmd/jetmon2/api_cli_events.go @@ -0,0 +1,270 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/url" + "strconv" + "strings" +) + +type apiEventsListFilters struct { + cursor string + limit int + state string + stateIn string + checkType string + checkTypeIn string + startedAtGTE string + startedAtLT string + active string +} + +type apiTransitionsListFilters struct { + cursor string + limit int +} + +type apiEventCloseOptions struct { + reason string + note string +} + +type apiEventCloseRequest struct { + Reason string `json:"reason,omitempty"` + Note string `json:"note,omitempty"` +} + +func cmdAPIEvents(args []string) error { + if len(args) == 0 { + return errors.New("usage: jetmon2 api events [flags]") + } + + sub := args[0] + rest := args[1:] + switch sub { + case "list": + return cmdAPIEventsList(rest) + case "get": + return cmdAPIEventsGet(rest) + case "transitions": + return cmdAPIEventsTransitions(rest) + case "close": + return cmdAPIEventsClose(rest) + default: + return fmt.Errorf("unknown api events subcommand %q (want: list, get, transitions, close)", sub) + } +} + +func cmdAPIEventsList(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api events list", &opts) + filters := apiEventsListFilters{} + fs.StringVar(&filters.cursor, "cursor", "", "pagination cursor") + fs.IntVar(&filters.limit, "limit", 0, "page size (1-200)") + 
fs.StringVar(&filters.state, "state", "", "filter by event state") + fs.StringVar(&filters.stateIn, "state-in", "", "comma-separated event states") + fs.StringVar(&filters.checkType, "check-type", "", "filter by check type") + fs.StringVar(&filters.checkTypeIn, "check-type-in", "", "comma-separated check types") + fs.StringVar(&filters.startedAtGTE, "started-at-gte", "", "filter events started at or after this RFC3339 timestamp") + fs.StringVar(&filters.startedAtLT, "started-at-lt", "", "filter events started before this RFC3339 timestamp") + fs.StringVar(&filters.active, "active", "", "filter open events: true or false") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api events list [flags] ") + } + target, err := apiEventsListPath(fs.Arg(0), filters) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPIEventsGet(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api events get", &opts) + var siteID string + fs.StringVar(&siteID, "site-id", "", "optional site id for site-scoped event lookup") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api events get [flags] ") + } + target, err := apiEventDetailPath(siteID, fs.Arg(0)) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPIEventsTransitions(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api events transitions", &opts) + filters := apiTransitionsListFilters{} + fs.StringVar(&filters.cursor, "cursor", "", "pagination cursor") + fs.IntVar(&filters.limit, "limit", 0, "page size (1-200)") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 2 { + return errors.New("usage: jetmon2 api events transitions 
[flags] ") + } + target, err := apiEventTransitionsPath(fs.Arg(0), fs.Arg(1), filters) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPIEventsClose(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api events close", &opts) + addAPIIdempotencyFlag(fs, &opts) + closeOpts := apiEventCloseOptions{} + fs.StringVar(&closeOpts.reason, "reason", "", "resolution reason (default: manual_override)") + fs.StringVar(&closeOpts.note, "note", "", "operator note recorded in transition metadata") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 2 { + return errors.New("usage: jetmon2 api events close [flags] ") + } + target, err := apiEventClosePath(fs.Arg(0), fs.Arg(1)) + if err != nil { + return err + } + body, err := marshalAPIEventCloseBody(closeOpts) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, target, body) +} + +func apiEventsListPath(rawSiteID string, filters apiEventsListFilters) (string, error) { + siteID, err := apiPositiveID(rawSiteID, "site") + if err != nil { + return "", err + } + if filters.limit < 0 { + return "", errors.New("limit must be positive") + } + if filters.state != "" && filters.stateIn != "" { + return "", errors.New("use --state or --state-in, not both") + } + if filters.checkType != "" && filters.checkTypeIn != "" { + return "", errors.New("use --check-type or --check-type-in, not both") + } + + values := url.Values{} + if filters.cursor != "" { + values.Set("cursor", filters.cursor) + } + if filters.limit > 0 { + values.Set("limit", strconv.Itoa(filters.limit)) + } + if filters.state != "" { + values.Set("state", filters.state) + } + if filters.stateIn != "" { + values.Set("state__in", filters.stateIn) + } + if filters.checkType != "" { + values.Set("check_type", filters.checkType) + } + if filters.checkTypeIn != "" { + 
values.Set("check_type__in", filters.checkTypeIn) + } + if filters.startedAtGTE != "" { + values.Set("started_at__gte", filters.startedAtGTE) + } + if filters.startedAtLT != "" { + values.Set("started_at__lt", filters.startedAtLT) + } + if strings.TrimSpace(filters.active) != "" { + active, err := strconv.ParseBool(filters.active) + if err != nil { + return "", errors.New("active must be true or false") + } + values.Set("active", strconv.FormatBool(active)) + } + + path := "/api/v1/sites/" + strconv.FormatInt(siteID, 10) + "/events" + if len(values) == 0 { + return path, nil + } + return path + "?" + values.Encode(), nil +} + +func apiEventDetailPath(rawSiteID, rawEventID string) (string, error) { + eventID, err := apiPositiveID(rawEventID, "event") + if err != nil { + return "", err + } + if rawSiteID == "" { + return "/api/v1/events/" + strconv.FormatInt(eventID, 10), nil + } + siteID, err := apiPositiveID(rawSiteID, "site") + if err != nil { + return "", err + } + return fmt.Sprintf("/api/v1/sites/%d/events/%d", siteID, eventID), nil +} + +func apiEventTransitionsPath(rawSiteID, rawEventID string, filters apiTransitionsListFilters) (string, error) { + path, err := apiEventDetailPath(rawSiteID, rawEventID) + if err != nil { + return "", err + } + if rawSiteID == "" { + return "", errors.New("site id is required for transitions") + } + if filters.limit < 0 { + return "", errors.New("limit must be positive") + } + + values := url.Values{} + if filters.cursor != "" { + values.Set("cursor", filters.cursor) + } + if filters.limit > 0 { + values.Set("limit", strconv.Itoa(filters.limit)) + } + path += "/transitions" + if len(values) == 0 { + return path, nil + } + return path + "?" 
+ values.Encode(), nil +} + +func apiEventClosePath(rawSiteID, rawEventID string) (string, error) { + path, err := apiEventDetailPath(rawSiteID, rawEventID) + if err != nil { + return "", err + } + if rawSiteID == "" { + return "", errors.New("site id is required for close") + } + return path + "/close", nil +} + +func marshalAPIEventCloseBody(opts apiEventCloseOptions) ([]byte, error) { + req := apiEventCloseRequest{ + Reason: opts.reason, + Note: opts.note, + } + return json.Marshal(req) +} + +func apiPositiveID(raw, label string) (int64, error) { + id, err := strconv.ParseInt(raw, 10, 64) + if err != nil || id <= 0 { + return 0, fmt.Errorf("%s id must be a positive integer", label) + } + return id, nil +} diff --git a/cmd/jetmon2/api_cli_events_test.go b/cmd/jetmon2/api_cli_events_test.go new file mode 100644 index 00000000..a7c619ef --- /dev/null +++ b/cmd/jetmon2/api_cli_events_test.go @@ -0,0 +1,131 @@ +package main + +import ( + "encoding/json" + "net/url" + "testing" +) + +func TestAPIEventsListPath(t *testing.T) { + got, err := apiEventsListPath("42", apiEventsListFilters{ + cursor: "cur-2", + limit: 20, + state: "Down", + checkTypeIn: "http,tls_expiry", + startedAtGTE: "2026-04-28T10:00:00Z", + startedAtLT: "2026-04-29T10:00:00Z", + active: "true", + }) + if err != nil { + t.Fatalf("apiEventsListPath() error = %v", err) + } + u, err := url.Parse(got) + if err != nil { + t.Fatalf("parse path: %v", err) + } + if u.Path != "/api/v1/sites/42/events" { + t.Fatalf("path = %q, want site events path", u.Path) + } + q := u.Query() + for key, want := range map[string]string{ + "cursor": "cur-2", + "limit": "20", + "state": "Down", + "check_type__in": "http,tls_expiry", + "started_at__gte": "2026-04-28T10:00:00Z", + "started_at__lt": "2026-04-29T10:00:00Z", + "active": "true", + } { + if got := q.Get(key); got != want { + t.Fatalf("query %s = %q, want %q", key, got, want) + } + } +} + +func TestAPIEventsListPathRejectsAmbiguousFilters(t *testing.T) { + if _, err := 
apiEventsListPath("42", apiEventsListFilters{state: "Down", stateIn: "Up,Down"}); err == nil { + t.Fatal("apiEventsListPath() state error = nil, want error") + } + if _, err := apiEventsListPath("42", apiEventsListFilters{checkType: "http", checkTypeIn: "http,tls"}); err == nil { + t.Fatal("apiEventsListPath() check type error = nil, want error") + } +} + +func TestAPIEventDetailPath(t *testing.T) { + got, err := apiEventDetailPath("", "99") + if err != nil { + t.Fatalf("apiEventDetailPath() direct error = %v", err) + } + if got != "/api/v1/events/99" { + t.Fatalf("direct path = %q, want /api/v1/events/99", got) + } + + got, err = apiEventDetailPath("42", "99") + if err != nil { + t.Fatalf("apiEventDetailPath() scoped error = %v", err) + } + if got != "/api/v1/sites/42/events/99" { + t.Fatalf("scoped path = %q, want site-scoped event path", got) + } +} + +func TestAPIEventTransitionsPath(t *testing.T) { + got, err := apiEventTransitionsPath("42", "99", apiTransitionsListFilters{ + cursor: "cur-3", + limit: 100, + }) + if err != nil { + t.Fatalf("apiEventTransitionsPath() error = %v", err) + } + u, err := url.Parse(got) + if err != nil { + t.Fatalf("parse path: %v", err) + } + if u.Path != "/api/v1/sites/42/events/99/transitions" { + t.Fatalf("path = %q, want transitions path", u.Path) + } + if got := u.Query().Get("cursor"); got != "cur-3" { + t.Fatalf("cursor = %q, want cur-3", got) + } + if got := u.Query().Get("limit"); got != "100" { + t.Fatalf("limit = %q, want 100", got) + } +} + +func TestAPIEventClosePath(t *testing.T) { + got, err := apiEventClosePath("42", "99") + if err != nil { + t.Fatalf("apiEventClosePath() error = %v", err) + } + if got != "/api/v1/sites/42/events/99/close" { + t.Fatalf("path = %q, want close path", got) + } +} + +func TestMarshalAPIEventCloseBody(t *testing.T) { + body, err := marshalAPIEventCloseBody(apiEventCloseOptions{ + reason: "false_alarm", + note: "verified from dashboard", + }) + if err != nil { + 
t.Fatalf("marshalAPIEventCloseBody() error = %v", err) + } + var got map[string]any + if err := json.Unmarshal(body, &got); err != nil { + t.Fatalf("unmarshal body: %v", err) + } + if got["reason"] != "false_alarm" { + t.Fatalf("reason = %#v, want false_alarm", got["reason"]) + } + if got["note"] != "verified from dashboard" { + t.Fatalf("note = %#v, want dashboard note", got["note"]) + } + + body, err = marshalAPIEventCloseBody(apiEventCloseOptions{}) + if err != nil { + t.Fatalf("marshalAPIEventCloseBody(empty) error = %v", err) + } + if string(body) != "{}" { + t.Fatalf("empty body = %s, want {}", body) + } +} diff --git a/cmd/jetmon2/api_cli_remote_guard_test.go b/cmd/jetmon2/api_cli_remote_guard_test.go new file mode 100644 index 00000000..5ceac23c --- /dev/null +++ b/cmd/jetmon2/api_cli_remote_guard_test.go @@ -0,0 +1,253 @@ +package main + +import ( + "bytes" + "context" + "strings" + "testing" +) + +func TestIsLocalAPIBaseURL(t *testing.T) { + tests := []struct { + name string + baseURL string + want bool + }{ + {name: "localhost", baseURL: "http://localhost:8090", want: true}, + {name: "localhost subdomain", baseURL: "http://jetmon.localhost:8090", want: true}, + {name: "ipv4 loopback", baseURL: "http://127.0.0.1:8090", want: true}, + {name: "ipv6 loopback", baseURL: "http://[::1]:8090", want: true}, + {name: "private lan is remote", baseURL: "http://10.0.0.171:8090", want: false}, + {name: "public hostname", baseURL: "https://jetmon-api.example.test", want: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := isLocalAPIURL(tt.baseURL) + if err != nil { + t.Fatalf("isLocalAPIURL() error = %v", err) + } + if got != tt.want { + t.Fatalf("isLocalAPIURL(%q) = %v, want %v", tt.baseURL, got, tt.want) + } + }) + } +} + +func TestExecuteAPIRequestRejectsRemoteWrite(t *testing.T) { + err := executeAPIRequest(context.Background(), nil, apiCLIOptions{ + baseURL: "https://jetmon-api.example.test", + out: ioDiscard{}, + errOut: 
ioDiscard{}, + }, "POST", "/api/v1/sites", []byte(`{}`)) + if err == nil || !strings.Contains(err.Error(), "--allow-remote") { + t.Fatalf("executeAPIRequest() error = %v, want --allow-remote refusal", err) + } +} + +func TestExecuteAPIRequestRejectsAbsoluteRemoteWriteWithLocalBase(t *testing.T) { + err := executeAPIRequest(context.Background(), nil, apiCLIOptions{ + baseURL: "http://localhost:8090", + out: ioDiscard{}, + errOut: ioDiscard{}, + }, "DELETE", "https://jetmon-api.example.test/api/v1/sites/42", nil) + if err == nil || !strings.Contains(err.Error(), "--allow-remote") { + t.Fatalf("executeAPIRequest() error = %v, want --allow-remote refusal", err) + } +} + +func TestRemoteWorkflowGuardRequiresAllowRemote(t *testing.T) { + opts := apiCLIOptions{baseURL: "https://jetmon-api.example.test"} + remote, err := requireAPILocalOrAllowRemote(opts, false, "api smoke") + if err == nil { + t.Fatal("requireAPILocalOrAllowRemote() error = nil, want refusal") + } + if !remote { + t.Fatal("remote = false, want true") + } + if !strings.Contains(err.Error(), "--allow-remote") { + t.Fatalf("error = %v, want --allow-remote hint", err) + } + + remote, err = requireAPILocalOrAllowRemote(opts, true, "api smoke") + if err != nil { + t.Fatalf("requireAPILocalOrAllowRemote(... 
allow) error = %v", err) + } + if !remote { + t.Fatal("remote = false with remote URL and allow flag, want true") + } +} + +func TestRunAPISitesBulkAddRemoteGuard(t *testing.T) { + opts := apiCLIOptions{baseURL: "https://jetmon-api.example.test", out: ioDiscard{}, errOut: ioDiscard{}} + bulk := apiSitesBulkAddOptions{ + count: 1, + batch: "remote-batch", + source: "fixture", + blogIDStart: defaultAPIBulkAddBlogIDStart, + } + err := runAPISitesBulkAdd(context.Background(), nil, opts, bulk) + if err == nil || !strings.Contains(err.Error(), "--allow-remote") { + t.Fatalf("runAPISitesBulkAdd() error = %v, want --allow-remote refusal", err) + } + + opts.allowRemote = true + bulk.batch = "" + err = runAPISitesBulkAdd(context.Background(), nil, opts, bulk) + if err == nil || !strings.Contains(err.Error(), "requires --batch") { + t.Fatalf("runAPISitesBulkAdd() error = %v, want remote batch requirement", err) + } +} + +func TestRunAPISitesBulkAddDryRunAllowsRemotePlanning(t *testing.T) { + var stdout bytes.Buffer + err := runAPISitesBulkAdd(context.Background(), nil, apiCLIOptions{ + baseURL: "https://jetmon-api.example.test", + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesBulkAddOptions{ + count: 1, + source: "fixture", + blogIDStart: defaultAPIBulkAddBlogIDStart, + dryRun: true, + }) + if err != nil { + t.Fatalf("runAPISitesBulkAdd() dry-run error = %v", err) + } + if !strings.Contains(stdout.String(), `"dry_run":true`) { + t.Fatalf("stdout = %s, want dry-run output", stdout.String()) + } +} + +func TestRunAPISitesCleanupRemoteGuard(t *testing.T) { + opts := apiCLIOptions{baseURL: "https://jetmon-api.example.test", out: ioDiscard{}, errOut: ioDiscard{}} + cleanup := apiSitesCleanupOptions{batch: "remote-batch", count: 1, ignoreNotFound: true} + err := runAPISitesCleanup(context.Background(), nil, opts, cleanup) + if err == nil || !strings.Contains(err.Error(), "--allow-remote") { + t.Fatalf("runAPISitesCleanup() error = %v, want --allow-remote refusal", err) + } + + 
opts.allowRemote = true + cleanup.allowUnmarked = true + err = runAPISitesCleanup(context.Background(), nil, opts, cleanup) + if err == nil || !strings.Contains(err.Error(), "cannot use --allow-unmarked") { + t.Fatalf("runAPISitesCleanup() error = %v, want allow-unmarked refusal", err) + } +} + +func TestRunAPISitesSimulateFailureRemoteGuard(t *testing.T) { + opts := apiCLIOptions{baseURL: "https://jetmon-api.example.test", out: ioDiscard{}, errOut: ioDiscard{}} + sim := apiSitesSimulateFailureOptions{ + mode: "http-500", + batch: "remote-batch", + count: 1, + trigger: false, + pollInterval: 1, + } + err := runAPISitesSimulateFailure(context.Background(), nil, opts, sim) + if err == nil || !strings.Contains(err.Error(), "--allow-remote") { + t.Fatalf("runAPISitesSimulateFailure() error = %v, want --allow-remote refusal", err) + } + + opts.allowRemote = true + sim.batch = "" + err = runAPISitesSimulateFailure(context.Background(), nil, opts, sim) + if err == nil || !strings.Contains(err.Error(), "requires --batch") { + t.Fatalf("runAPISitesSimulateFailure() error = %v, want remote batch requirement", err) + } +} + +func TestRunAPISmokeRemoteGuard(t *testing.T) { + err := runAPISmoke(context.Background(), nil, apiCLIOptions{ + baseURL: "https://jetmon-api.example.test", + out: ioDiscard{}, + errOut: ioDiscard{}, + }, apiSmokeOptions{batch: "remote-smoke", exercise: "none"}) + if err == nil || !strings.Contains(err.Error(), "--allow-remote") { + t.Fatalf("runAPISmoke() error = %v, want --allow-remote refusal", err) + } + + err = runAPISmoke(context.Background(), nil, apiCLIOptions{ + baseURL: "https://jetmon-api.example.test", + allowRemote: true, + out: ioDiscard{}, + errOut: ioDiscard{}, + }, apiSmokeOptions{exercise: "none"}) + if err == nil || !strings.Contains(err.Error(), "requires --batch") { + t.Fatalf("runAPISmoke() error = %v, want remote batch requirement", err) + } +} + +func TestRunAPISmokeWebhookExerciseRemoteGuard(t *testing.T) { + err := 
runAPISmoke(context.Background(), nil, apiCLIOptions{ + baseURL: "https://jetmon-api.example.test", + allowRemote: true, + out: ioDiscard{}, + errOut: ioDiscard{}, + }, apiSmokeOptions{ + batch: "remote-smoke", + exercise: "webhook", + }) + if err == nil || !strings.Contains(err.Error(), "Docker-local only") { + t.Fatalf("runAPISmoke() error = %v, want Docker-local webhook refusal", err) + } +} + +func TestRunAPISmokeWebhookRequiresLocalRequestsURL(t *testing.T) { + err := runAPISmoke(context.Background(), nil, apiCLIOptions{ + baseURL: "http://localhost:8090", + out: ioDiscard{}, + errOut: ioDiscard{}, + }, apiSmokeOptions{ + batch: "local-smoke", + exercise: "webhook", + webhookRequestsURL: "https://fixture.example.test/webhook/requests", + }) + if err == nil || !strings.Contains(err.Error(), "webhook-requests-url must be local") { + t.Fatalf("runAPISmoke() error = %v, want local webhook requests URL refusal", err) + } +} + +func TestRunAPISmokeWebhookRejectsExternalWebhookURL(t *testing.T) { + err := runAPISmoke(context.Background(), nil, apiCLIOptions{ + baseURL: "http://localhost:8090", + out: ioDiscard{}, + errOut: ioDiscard{}, + }, apiSmokeOptions{ + batch: "local-smoke", + exercise: "webhook", + webhookURL: "https://receiver.example.test/webhook", + webhookRequestsURL: "http://localhost:18091/webhook/requests", + }) + if err == nil || !strings.Contains(err.Error(), "allow-external-webhook-url") { + t.Fatalf("runAPISmoke() error = %v, want external webhook URL refusal", err) + } +} + +func TestRequireAPIWebhookFixtureURLAllowed(t *testing.T) { + tests := []struct { + name string + rawURL string + allowExternal bool + wantErr bool + }{ + {name: "api fixture", rawURL: "http://api-fixture:8091/webhook"}, + {name: "localhost", rawURL: "http://localhost:18091/webhook"}, + {name: "loopback", rawURL: "http://127.0.0.1:18091/webhook"}, + {name: "external blocked", rawURL: "https://receiver.example.test/webhook", wantErr: true}, + {name: "external explicit", rawURL: 
"https://receiver.example.test/webhook", allowExternal: true}, + {name: "relative rejected", rawURL: "/webhook", wantErr: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := requireAPIWebhookFixtureURLAllowed(tt.rawURL, tt.allowExternal) + if tt.wantErr && err == nil { + t.Fatal("requireAPIWebhookFixtureURLAllowed() error = nil, want error") + } + if !tt.wantErr && err != nil { + t.Fatalf("requireAPIWebhookFixtureURLAllowed() error = %v", err) + } + }) + } +} diff --git a/cmd/jetmon2/api_cli_sites.go b/cmd/jetmon2/api_cli_sites.go new file mode 100644 index 00000000..a7489018 --- /dev/null +++ b/cmd/jetmon2/api_cli_sites.go @@ -0,0 +1,538 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "flag" + "fmt" + "io" + "net/http" + "net/url" + "sort" + "strconv" + "strings" +) + +type apiSitesListFilters struct { + cursor string + limit int + state string + stateIn string + severityGTE int + monitorActive string + q string +} + +type apiSiteCreateOptions struct { + blogID int64 + monitorURL string + monitorActive apiOptionalBoolFlag + bucketNo apiOptionalIntFlag + checkKeyword apiOptionalStringFlag + forbiddenKeyword apiOptionalStringFlag + forbiddenKeywords apiStringSliceFlags + redirectPolicy apiOptionalStringFlag + requestMethod apiOptionalStringFlag + detectionProfile apiOptionalStringFlag + timeoutSeconds apiOptionalIntFlag + customHeaders apiStringMapFlags + alertCooldownMinutes apiOptionalIntFlag + checkInterval apiOptionalIntFlag +} + +type apiSiteUpdateOptions struct { + monitorURL apiOptionalStringFlag + monitorActive apiOptionalBoolFlag + bucketNo apiOptionalIntFlag + checkKeyword apiOptionalStringFlag + forbiddenKeyword apiOptionalStringFlag + forbiddenKeywords apiStringSliceFlags + clearForbiddenKeywords bool + redirectPolicy apiOptionalStringFlag + requestMethod apiOptionalStringFlag + detectionProfile apiOptionalStringFlag + timeoutSeconds apiOptionalIntFlag + customHeaders apiStringMapFlags + 
clearCustomHeaders bool + alertCooldownMinutes apiOptionalIntFlag + checkInterval apiOptionalIntFlag + maintenanceStart apiOptionalStringFlag + maintenanceEnd apiOptionalStringFlag +} + +type apiSiteCreateRequest struct { + BlogID int64 `json:"blog_id"` + MonitorURL string `json:"monitor_url"` + MonitorActive *bool `json:"monitor_active,omitempty"` + BucketNo *int `json:"bucket_no,omitempty"` + CheckKeyword *string `json:"check_keyword,omitempty"` + ForbiddenKeyword *string `json:"forbidden_keyword,omitempty"` + ForbiddenKeywords *[]string `json:"forbidden_keywords,omitempty"` + RedirectPolicy *string `json:"redirect_policy,omitempty"` + RequestMethod *string `json:"request_method,omitempty"` + DetectionProfile *string `json:"detection_profile,omitempty"` + TimeoutSeconds *int `json:"timeout_seconds,omitempty"` + CustomHeaders *map[string]string `json:"custom_headers,omitempty"` + AlertCooldownMinutes *int `json:"alert_cooldown_minutes,omitempty"` + CheckInterval *int `json:"check_interval,omitempty"` +} + +type apiSiteUpdateRequest struct { + MonitorURL *string `json:"monitor_url,omitempty"` + MonitorActive *bool `json:"monitor_active,omitempty"` + BucketNo *int `json:"bucket_no,omitempty"` + CheckKeyword *string `json:"check_keyword,omitempty"` + ForbiddenKeyword *string `json:"forbidden_keyword,omitempty"` + ForbiddenKeywords *[]string `json:"forbidden_keywords,omitempty"` + RedirectPolicy *string `json:"redirect_policy,omitempty"` + RequestMethod *string `json:"request_method,omitempty"` + DetectionProfile *string `json:"detection_profile,omitempty"` + TimeoutSeconds *int `json:"timeout_seconds,omitempty"` + CustomHeaders *map[string]string `json:"custom_headers,omitempty"` + AlertCooldownMinutes *int `json:"alert_cooldown_minutes,omitempty"` + CheckInterval *int `json:"check_interval,omitempty"` + MaintenanceStart *string `json:"maintenance_start,omitempty"` + MaintenanceEnd *string `json:"maintenance_end,omitempty"` +} + +func cmdAPISites(args []string) error 
{ + if len(args) == 0 { + return errors.New("usage: jetmon2 api sites [flags]") + } + + sub := args[0] + rest := args[1:] + switch sub { + case "list": + return cmdAPISitesList(rest) + case "get": + return cmdAPISitesGet(rest) + case "create": + return cmdAPISitesCreate(rest) + case "update": + return cmdAPISitesUpdate(rest) + case "delete": + return cmdAPISitesDelete(rest) + case "pause": + return cmdAPISitesPostAction(rest, "pause", "pause") + case "resume": + return cmdAPISitesPostAction(rest, "resume", "resume") + case "trigger-now": + return cmdAPISitesPostAction(rest, "trigger-now", "trigger-now") + case "bulk-add": + return cmdAPISitesBulkAdd(rest) + case "cleanup": + return cmdAPISitesCleanup(rest) + case "simulate-failure": + return cmdAPISitesSimulateFailure(rest) + default: + return fmt.Errorf("unknown api sites subcommand %q (want: list, get, create, update, delete, pause, resume, trigger-now, bulk-add, cleanup, simulate-failure)", sub) + } +} + +func printAPISitesUsage(w io.Writer) { + fmt.Fprintln(w, "usage: jetmon2 api sites [flags]") +} + +func cmdAPISitesList(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites list", &opts) + filters := apiSitesListFilters{severityGTE: -1} + fs.StringVar(&filters.cursor, "cursor", "", "pagination cursor") + fs.IntVar(&filters.limit, "limit", 0, "page size (1-200)") + fs.StringVar(&filters.state, "state", "", "filter by current state") + fs.StringVar(&filters.stateIn, "state-in", "", "comma-separated current states") + fs.IntVar(&filters.severityGTE, "severity-gte", -1, "minimum current severity") + fs.StringVar(&filters.monitorActive, "monitor-active", "", "filter active sites: true or false") + fs.StringVar(&filters.q, "q", "", "monitor URL substring search") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api sites list [flags]") + } + target, err := apiSitesListPath(filters) + if err != nil { + return 
err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPISitesGet(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites get", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api sites get [flags] ") + } + target, err := apiSiteResourcePath(fs.Arg(0), "") + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPISitesCreate(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites create", &opts) + addAPIIdempotencyFlag(fs, &opts) + create := apiSiteCreateOptions{} + fs.Int64Var(&create.blogID, "blog-id", 0, "site blog_id") + fs.StringVar(&create.monitorURL, "url", "", "site monitor URL") + fs.Var(&create.monitorActive, "monitor-active", "monitoring enabled: true or false") + fs.Var(&create.bucketNo, "bucket-no", "bucket number") + fs.Var(&create.checkKeyword, "check-keyword", "keyword required in response body") + fs.Var(&create.forbiddenKeyword, "forbidden-keyword", "keyword forbidden in response body") + fs.Var(&create.forbiddenKeywords, "forbidden-keyword-list", "additional forbidden body keyword (repeatable or comma-separated)") + fs.Var(&create.redirectPolicy, "redirect-policy", "redirect policy: follow, alert, or fail") + fs.Var(&create.requestMethod, "request-method", "HTTP check method: HEAD or GET") + fs.Var(&create.detectionProfile, "detection-profile", "detection profile: legacy, simple_http, or full") + fs.Var(&create.timeoutSeconds, "timeout-seconds", "per-site timeout in seconds") + fs.Var(&create.customHeaders, "custom-header", "site custom header in Name: Value form (repeatable)") + fs.Var(&create.alertCooldownMinutes, "alert-cooldown-minutes", "per-site alert cooldown in minutes") + fs.Var(&create.checkInterval, "check-interval", "check interval in minutes") + if err := 
parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api sites create [flags]") + } + body, err := marshalAPISiteCreateBody(create) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, "/api/v1/sites", body) +} + +func cmdAPISitesUpdate(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites update", &opts) + update := apiSiteUpdateOptions{} + fs.Var(&update.monitorURL, "url", "site monitor URL") + fs.Var(&update.monitorActive, "monitor-active", "monitoring enabled: true or false") + fs.Var(&update.bucketNo, "bucket-no", "bucket number") + fs.Var(&update.checkKeyword, "check-keyword", "keyword required in response body; empty clears it") + fs.Var(&update.forbiddenKeyword, "forbidden-keyword", "keyword forbidden in response body; empty clears it") + fs.Var(&update.forbiddenKeywords, "forbidden-keyword-list", "replacement forbidden body keyword list (repeatable or comma-separated)") + fs.BoolVar(&update.clearForbiddenKeywords, "clear-forbidden-keywords", false, "clear the forbidden body keyword list") + fs.Var(&update.redirectPolicy, "redirect-policy", "redirect policy: follow, alert, or fail") + fs.Var(&update.requestMethod, "request-method", "HTTP check method: HEAD or GET; empty inherits default") + fs.Var(&update.detectionProfile, "detection-profile", "detection profile: legacy, simple_http, or full; empty inherits default") + fs.Var(&update.timeoutSeconds, "timeout-seconds", "per-site timeout in seconds") + fs.Var(&update.customHeaders, "custom-header", "site custom header in Name: Value form (repeatable)") + fs.BoolVar(&update.clearCustomHeaders, "clear-custom-headers", false, "clear all site custom headers") + fs.Var(&update.alertCooldownMinutes, "alert-cooldown-minutes", "per-site alert cooldown in minutes") + fs.Var(&update.checkInterval, "check-interval", "check interval in minutes") + 
fs.Var(&update.maintenanceStart, "maintenance-start", "maintenance start RFC3339 timestamp; empty clears it") + fs.Var(&update.maintenanceEnd, "maintenance-end", "maintenance end RFC3339 timestamp; empty clears it") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api sites update [flags] ") + } + target, err := apiSiteResourcePath(fs.Arg(0), "") + if err != nil { + return err + } + body, err := marshalAPISiteUpdateBody(update) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPatch, target, body) +} + +func cmdAPISitesDelete(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites delete", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api sites delete [flags] ") + } + target, err := apiSiteResourcePath(fs.Arg(0), "") + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodDelete, target, nil) +} + +func cmdAPISitesPostAction(args []string, usageName, suffix string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites "+usageName, &opts) + addAPIIdempotencyFlag(fs, &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return fmt.Errorf("usage: jetmon2 api sites %s [flags] ", usageName) + } + target, err := apiSiteResourcePath(fs.Arg(0), suffix) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, target, nil) +} + +func addAPIIdempotencyFlag(fs *flag.FlagSet, opts *apiCLIOptions) { + fs.StringVar(&opts.idempotencyKey, "idempotency-key", "", "Idempotency-Key header for POST retries") +} + +func apiSitesListPath(filters apiSitesListFilters) (string, error) { + if filters.limit < 0 { + return "", errors.New("limit must be positive") + } + if 
filters.severityGTE < -1 { + return "", errors.New("severity-gte must be zero or greater") + } + if filters.state != "" && filters.stateIn != "" { + return "", errors.New("use --state or --state-in, not both") + } + + values := url.Values{} + if filters.cursor != "" { + values.Set("cursor", filters.cursor) + } + if filters.limit > 0 { + values.Set("limit", strconv.Itoa(filters.limit)) + } + if filters.state != "" { + values.Set("state", filters.state) + } + if filters.stateIn != "" { + values.Set("state__in", filters.stateIn) + } + if filters.severityGTE >= 0 { + values.Set("severity__gte", strconv.Itoa(filters.severityGTE)) + } + if strings.TrimSpace(filters.monitorActive) != "" { + active, err := strconv.ParseBool(filters.monitorActive) + if err != nil { + return "", errors.New("monitor-active must be true or false") + } + values.Set("monitor_active", strconv.FormatBool(active)) + } + if filters.q != "" { + values.Set("q", filters.q) + } + + if len(values) == 0 { + return "/api/v1/sites", nil + } + return "/api/v1/sites?" 
+ values.Encode(), nil +} + +func apiSiteResourcePath(rawID, suffix string) (string, error) { + id, err := strconv.ParseInt(rawID, 10, 64) + if err != nil || id <= 0 { + return "", errors.New("site id must be a positive integer") + } + path := "/api/v1/sites/" + strconv.FormatInt(id, 10) + if suffix != "" { + path += "/" + strings.TrimPrefix(suffix, "/") + } + return path, nil +} + +func marshalAPISiteCreateBody(opts apiSiteCreateOptions) ([]byte, error) { + if opts.blogID <= 0 { + return nil, errors.New("blog-id is required and must be a positive integer") + } + if strings.TrimSpace(opts.monitorURL) == "" { + return nil, errors.New("url is required") + } + + req := apiSiteCreateRequest{ + BlogID: opts.blogID, + MonitorURL: opts.monitorURL, + MonitorActive: opts.monitorActive.ptr(), + BucketNo: opts.bucketNo.ptr(), + CheckKeyword: opts.checkKeyword.ptr(), + ForbiddenKeyword: opts.forbiddenKeyword.ptr(), + ForbiddenKeywords: opts.forbiddenKeywords.ptr(), + RedirectPolicy: opts.redirectPolicy.ptr(), + RequestMethod: opts.requestMethod.ptr(), + DetectionProfile: opts.detectionProfile.ptr(), + TimeoutSeconds: opts.timeoutSeconds.ptr(), + CustomHeaders: opts.customHeaders.ptr(), + AlertCooldownMinutes: opts.alertCooldownMinutes.ptr(), + CheckInterval: opts.checkInterval.ptr(), + } + return json.Marshal(req) +} + +func marshalAPISiteUpdateBody(opts apiSiteUpdateOptions) ([]byte, error) { + if opts.clearCustomHeaders && opts.customHeaders.set { + return nil, errors.New("use --custom-header or --clear-custom-headers, not both") + } + if opts.clearForbiddenKeywords && opts.forbiddenKeywords.set { + return nil, errors.New("use --forbidden-keyword-list or --clear-forbidden-keywords, not both") + } + + req := apiSiteUpdateRequest{ + MonitorURL: opts.monitorURL.ptr(), + MonitorActive: opts.monitorActive.ptr(), + BucketNo: opts.bucketNo.ptr(), + CheckKeyword: opts.checkKeyword.ptr(), + ForbiddenKeyword: opts.forbiddenKeyword.ptr(), + ForbiddenKeywords: 
opts.forbiddenKeywords.ptr(), + RedirectPolicy: opts.redirectPolicy.ptr(), + RequestMethod: opts.requestMethod.ptr(), + DetectionProfile: opts.detectionProfile.ptr(), + TimeoutSeconds: opts.timeoutSeconds.ptr(), + CustomHeaders: opts.customHeaders.ptr(), + AlertCooldownMinutes: opts.alertCooldownMinutes.ptr(), + CheckInterval: opts.checkInterval.ptr(), + MaintenanceStart: opts.maintenanceStart.ptr(), + MaintenanceEnd: opts.maintenanceEnd.ptr(), + } + if opts.clearCustomHeaders { + empty := map[string]string{} + req.CustomHeaders = &empty + } + if opts.clearForbiddenKeywords { + empty := []string{} + req.ForbiddenKeywords = &empty + } + return json.Marshal(req) +} + +type apiOptionalBoolFlag struct { + value bool + set bool +} + +func (f *apiOptionalBoolFlag) Set(v string) error { + parsed, err := strconv.ParseBool(v) + if err != nil { + return err + } + f.value = parsed + f.set = true + return nil +} + +func (f *apiOptionalBoolFlag) String() string { + if !f.set { + return "" + } + return strconv.FormatBool(f.value) +} + +func (f *apiOptionalBoolFlag) IsBoolFlag() bool { + return true +} + +func (f apiOptionalBoolFlag) ptr() *bool { + if !f.set { + return nil + } + v := f.value + return &v +} + +type apiOptionalIntFlag struct { + value int + set bool +} + +func (f *apiOptionalIntFlag) Set(v string) error { + parsed, err := strconv.Atoi(v) + if err != nil { + return err + } + f.value = parsed + f.set = true + return nil +} + +func (f *apiOptionalIntFlag) String() string { + if !f.set { + return "" + } + return strconv.Itoa(f.value) +} + +func (f apiOptionalIntFlag) ptr() *int { + if !f.set { + return nil + } + v := f.value + return &v +} + +type apiOptionalStringFlag struct { + value string + set bool +} + +func (f *apiOptionalStringFlag) Set(v string) error { + f.value = v + f.set = true + return nil +} + +func (f *apiOptionalStringFlag) String() string { + return f.value +} + +func (f apiOptionalStringFlag) ptr() *string { + if !f.set { + return nil + } + v := 
f.value + return &v +} + +type apiStringMapFlags struct { + values map[string]string + set bool +} + +func (f *apiStringMapFlags) Set(v string) error { + name, value, ok := strings.Cut(v, ":") + if !ok { + return fmt.Errorf("custom header %q must be in Name: Value form", v) + } + name = strings.TrimSpace(name) + if name == "" { + return errors.New("custom header name must not be empty") + } + if f.values == nil { + f.values = map[string]string{} + } + f.values[name] = strings.TrimSpace(value) + f.set = true + return nil +} + +func (f *apiStringMapFlags) String() string { + if !f.set { + return "" + } + keys := make([]string, 0, len(f.values)) + for k := range f.values { + keys = append(keys, k) + } + sort.Strings(keys) + parts := make([]string, 0, len(keys)) + for _, k := range keys { + parts = append(parts, k+": "+f.values[k]) + } + return strings.Join(parts, ", ") +} + +func (f apiStringMapFlags) ptr() *map[string]string { + if !f.set { + return nil + } + values := make(map[string]string, len(f.values)) + for k, v := range f.values { + values[k] = v + } + return &values +} diff --git a/cmd/jetmon2/api_cli_sites_bulk.go b/cmd/jetmon2/api_cli_sites_bulk.go new file mode 100644 index 00000000..f045bc73 --- /dev/null +++ b/cmd/jetmon2/api_cli_sites_bulk.go @@ -0,0 +1,416 @@ +package main + +import ( + "bytes" + "context" + "encoding/csv" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "os" + "strconv" + "strings" +) + +const ( + apiSitesBulkAddMaxCount = 200 + defaultAPIBulkAddBlogIDStart = int64(900000000) +) + +type apiSitesBulkAddOptions struct { + count int + batch string + source string + file string + blogIDStart int64 + dryRun bool + idempotencyKeyPrefix string + monitorActive apiOptionalBoolFlag +} + +type apiBulkSiteEntry struct { + MonitorURL string `json:"monitor_url"` + CheckKeyword *string `json:"check_keyword,omitempty"` + ForbiddenKeyword *string `json:"forbidden_keyword,omitempty"` + ForbiddenKeywords []string 
`json:"forbidden_keywords,omitempty"` + RedirectPolicy *string `json:"redirect_policy,omitempty"` + RequestMethod *string `json:"request_method,omitempty"` + DetectionProfile *string `json:"detection_profile,omitempty"` + TimeoutSeconds *int `json:"timeout_seconds,omitempty"` + CustomHeaders map[string]string `json:"custom_headers,omitempty"` + AlertCooldownMinutes *int `json:"alert_cooldown_minutes,omitempty"` + CheckInterval *int `json:"check_interval,omitempty"` +} + +type apiSitesBulkAddOutput struct { + DryRun bool `json:"dry_run,omitempty"` + Count int `json:"count"` + Sites []json.RawMessage `json:"sites,omitempty"` + Created []json.RawMessage `json:"created,omitempty"` +} + +func cmdAPISitesBulkAdd(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites bulk-add", &opts) + bulk := apiSitesBulkAddOptions{ + source: "fixture", + blogIDStart: defaultAPIBulkAddBlogIDStart, + } + fs.IntVar(&bulk.count, "count", 0, "number of sites to create, max 200") + fs.StringVar(&bulk.batch, "batch", "", "stable batch label; derives blog ids and stores a custom header marker") + fs.StringVar(&bulk.source, "source", bulk.source, "site source: fixture, file, or stdin") + fs.StringVar(&bulk.file, "file", "", "source file for --source file") + fs.Int64Var(&bulk.blogIDStart, "blog-id-start", bulk.blogIDStart, "first blog_id to assign") + fs.BoolVar(&bulk.dryRun, "dry-run", false, "print planned create payloads without sending requests") + fs.StringVar(&bulk.idempotencyKeyPrefix, "idempotency-key-prefix", "", "prefix for per-site Idempotency-Key headers") + fs.Var(&bulk.monitorActive, "monitor-active", "override monitor_active for every generated site") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api sites bulk-add [flags]") + } + return runAPISitesBulkAdd(context.Background(), nil, opts, bulk) +} + +func runAPISitesBulkAdd(ctx context.Context, client *http.Client, opts 
apiCLIOptions, bulk apiSitesBulkAddOptions) error { + if opts.out == nil { + opts.out = io.Discard + } + entries, err := loadAPIBulkSiteEntries(bulk, opts.in) + if err != nil { + return err + } + planned, err := planAPIBulkSiteCreates(entries, bulk) + if err != nil { + return err + } + + if bulk.dryRun { + sites, err := marshalAPIBulkSiteRequests(planned) + if err != nil { + return err + } + return writeAPIValueOutput(opts.out, apiSitesBulkAddOutput{ + DryRun: true, + Count: len(sites), + Sites: sites, + }, opts) + } + + remote, err := requireAPILocalOrAllowRemote(opts, opts.allowRemote, "api sites bulk-add") + if err != nil { + return err + } + if remote && strings.TrimSpace(bulk.batch) == "" { + return errors.New("api sites bulk-add requires --batch when --allow-remote targets a non-local API") + } + + created := make([]json.RawMessage, 0, len(planned)) + for i, req := range planned { + body, err := json.Marshal(req) + if err != nil { + return err + } + requestOpts := opts + var response bytes.Buffer + requestOpts.out = &response + if bulk.idempotencyKeyPrefix != "" { + requestOpts.idempotencyKey = fmt.Sprintf("%s-%03d", bulk.idempotencyKeyPrefix, i+1) + } + if err := executeAPIRequest(ctx, client, requestOpts, http.MethodPost, "/api/v1/sites", body); err != nil { + if response.Len() > 0 { + _, _ = opts.out.Write(response.Bytes()) + } + return fmt.Errorf("create site %d (%s): %w", req.BlogID, req.MonitorURL, err) + } + created = append(created, json.RawMessage(bytes.TrimSpace(response.Bytes()))) + } + + return writeAPIValueOutput(opts.out, apiSitesBulkAddOutput{ + Count: len(created), + Created: created, + }, opts) +} + +func loadAPIBulkSiteEntries(opts apiSitesBulkAddOptions, in io.Reader) ([]apiBulkSiteEntry, error) { + var data []byte + var err error + switch opts.source { + case "fixture": + if opts.file != "" { + return nil, errors.New("--file is only valid with --source file") + } + data = apiCLISiteFixture + case "file": + if opts.file == "" { + return 
nil, errors.New("--file is required with --source file") + } + data, err = os.ReadFile(opts.file) + if err != nil { + return nil, err + } + case "stdin": + if in == nil { + return nil, errors.New("stdin source requires an input reader") + } + data, err = io.ReadAll(in) + if err != nil { + return nil, err + } + default: + return nil, errors.New("source must be one of: fixture, file, stdin") + } + return parseAPIBulkSiteEntries(data) +} + +func parseAPIBulkSiteEntries(data []byte) ([]apiBulkSiteEntry, error) { + trimmed := bytes.TrimSpace(data) + if len(trimmed) == 0 { + return nil, errors.New("site source is empty") + } + if trimmed[0] == '[' || trimmed[0] == '{' || trimmed[0] == '"' { + return parseAPIBulkJSONSiteEntries(trimmed) + } + return parseAPIBulkCSVSiteEntries(trimmed) +} + +func parseAPIBulkJSONSiteEntries(data []byte) ([]apiBulkSiteEntry, error) { + var raw []json.RawMessage + if data[0] == '[' { + if err := json.Unmarshal(data, &raw); err != nil { + return nil, err + } + } else { + raw = []json.RawMessage{data} + } + entries := make([]apiBulkSiteEntry, 0, len(raw)) + for _, item := range raw { + var entry apiBulkSiteEntry + if err := json.Unmarshal(item, &entry); err != nil { + return nil, err + } + entries = append(entries, entry) + } + return validateAPIBulkSiteEntries(entries) +} + +func parseAPIBulkCSVSiteEntries(data []byte) ([]apiBulkSiteEntry, error) { + r := csv.NewReader(bytes.NewReader(data)) + r.TrimLeadingSpace = true + r.FieldsPerRecord = -1 + records, err := r.ReadAll() + if err != nil { + return nil, err + } + if len(records) == 0 { + return nil, errors.New("site source is empty") + } + + header := apiBulkCSVHeader(records[0]) + start := 0 + if len(header) > 0 { + start = 1 + } + + entries := make([]apiBulkSiteEntry, 0, len(records)-start) + for _, record := range records[start:] { + if len(record) == 0 || strings.TrimSpace(record[0]) == "" { + continue + } + if len(header) == 0 { + entries = append(entries, apiBulkSiteEntry{MonitorURL: 
strings.TrimSpace(record[0])}) + continue + } + entry, err := apiBulkSiteEntryFromCSVRecord(header, record) + if err != nil { + return nil, err + } + entries = append(entries, entry) + } + return validateAPIBulkSiteEntries(entries) +} + +func apiBulkCSVHeader(record []string) map[string]int { + header := map[string]int{} + hasURL := false + for i, col := range record { + name := strings.ToLower(strings.TrimSpace(col)) + header[name] = i + if name == "monitor_url" || name == "url" { + hasURL = true + } + } + if !hasURL { + return nil + } + return header +} + +func apiBulkSiteEntryFromCSVRecord(header map[string]int, record []string) (apiBulkSiteEntry, error) { + entry := apiBulkSiteEntry{} + entry.MonitorURL = csvField(header, record, "monitor_url") + if entry.MonitorURL == "" { + entry.MonitorURL = csvField(header, record, "url") + } + if v := csvField(header, record, "check_keyword"); v != "" { + entry.CheckKeyword = &v + } + if v := csvField(header, record, "forbidden_keyword"); v != "" { + entry.ForbiddenKeyword = &v + } + if v := csvField(header, record, "forbidden_keywords"); v != "" { + entry.ForbiddenKeywords = splitAPIBulkStringList(v) + } + if v := csvField(header, record, "redirect_policy"); v != "" { + entry.RedirectPolicy = &v + } + if v := csvField(header, record, "request_method"); v != "" { + entry.RequestMethod = &v + } + if v := csvField(header, record, "detection_profile"); v != "" { + entry.DetectionProfile = &v + } + if v := csvField(header, record, "timeout_seconds"); v != "" { + parsed, err := strconv.Atoi(v) + if err != nil { + return entry, fmt.Errorf("timeout_seconds must be an integer: %w", err) + } + entry.TimeoutSeconds = &parsed + } + if v := csvField(header, record, "check_interval"); v != "" { + parsed, err := strconv.Atoi(v) + if err != nil { + return entry, fmt.Errorf("check_interval must be an integer: %w", err) + } + entry.CheckInterval = &parsed + } + return entry, nil +} + +func csvField(header map[string]int, record []string, 
name string) string { + idx, ok := header[name] + if !ok || idx >= len(record) { + return "" + } + return strings.TrimSpace(record[idx]) +} + +func validateAPIBulkSiteEntries(entries []apiBulkSiteEntry) ([]apiBulkSiteEntry, error) { + if len(entries) == 0 { + return nil, errors.New("no sites found in source") + } + for i := range entries { + entries[i].MonitorURL = strings.TrimSpace(entries[i].MonitorURL) + if entries[i].MonitorURL == "" { + return nil, fmt.Errorf("site source entry %d is missing monitor_url", i+1) + } + } + return entries, nil +} + +func planAPIBulkSiteCreates(entries []apiBulkSiteEntry, opts apiSitesBulkAddOptions) ([]apiSiteCreateRequest, error) { + if opts.count <= 0 { + return nil, errors.New("count is required and must be positive") + } + if opts.count > apiSitesBulkAddMaxCount { + return nil, fmt.Errorf("count must be <= %d", apiSitesBulkAddMaxCount) + } + if opts.blogIDStart <= 0 { + return nil, errors.New("blog-id-start must be a positive integer") + } + if opts.batch != "" && opts.blogIDStart == defaultAPIBulkAddBlogIDStart { + opts.blogIDStart = apiCLIBatchBlogIDStart(opts.batch) + } + if len(entries) == 0 { + return nil, errors.New("no sites found in source") + } + + out := make([]apiSiteCreateRequest, 0, opts.count) + for i := 0; i < opts.count; i++ { + entry := entries[i%len(entries)] + req := apiSiteCreateRequest{ + BlogID: opts.blogIDStart + int64(i), + MonitorURL: entry.MonitorURL, + MonitorActive: opts.monitorActive.ptr(), + CheckKeyword: entry.CheckKeyword, + ForbiddenKeyword: entry.ForbiddenKeyword, + ForbiddenKeywords: forbiddenKeywordsPtr(entry.ForbiddenKeywords), + RedirectPolicy: entry.RedirectPolicy, + RequestMethod: entry.RequestMethod, + DetectionProfile: entry.DetectionProfile, + TimeoutSeconds: entry.TimeoutSeconds, + AlertCooldownMinutes: entry.AlertCooldownMinutes, + CheckInterval: entry.CheckInterval, + } + if len(entry.CustomHeaders) > 0 || opts.batch != "" { + headers := make(map[string]string, 
len(entry.CustomHeaders)+1) + for k, v := range entry.CustomHeaders { + headers[k] = v + } + if opts.batch != "" { + headers[apiCLIBatchHeader] = opts.batch + } + req.CustomHeaders = &headers + } + out = append(out, req) + } + return out, nil +} + +func marshalAPIBulkSiteRequests(requests []apiSiteCreateRequest) ([]json.RawMessage, error) { + out := make([]json.RawMessage, 0, len(requests)) + for _, req := range requests { + b, err := json.Marshal(req) + if err != nil { + return nil, err + } + out = append(out, json.RawMessage(b)) + } + return out, nil +} + +func splitAPIBulkStringList(raw string) []string { + parts := strings.Split(raw, ",") + out := make([]string, 0, len(parts)) + for _, part := range parts { + part = strings.TrimSpace(part) + if part != "" { + out = append(out, part) + } + } + return out +} + +func forbiddenKeywordsPtr(values []string) *[]string { + if len(values) == 0 { + return nil + } + out := make([]string, len(values)) + copy(out, values) + return &out +} + +func (e *apiBulkSiteEntry) UnmarshalJSON(data []byte) error { + var urlOnly string + if err := json.Unmarshal(data, &urlOnly); err == nil { + e.MonitorURL = urlOnly + return nil + } + + type bulkSiteEntry apiBulkSiteEntry + var aux struct { + bulkSiteEntry + URL string `json:"url"` + } + if err := json.Unmarshal(data, &aux); err != nil { + return err + } + *e = apiBulkSiteEntry(aux.bulkSiteEntry) + if e.MonitorURL == "" { + e.MonitorURL = aux.URL + } + return nil +} diff --git a/cmd/jetmon2/api_cli_sites_bulk_test.go b/cmd/jetmon2/api_cli_sites_bulk_test.go new file mode 100644 index 00000000..f569c8d1 --- /dev/null +++ b/cmd/jetmon2/api_cli_sites_bulk_test.go @@ -0,0 +1,182 @@ +package main + +import ( + "encoding/json" + "strings" + "testing" +) + +func TestParseAPIBulkJSONSiteEntries(t *testing.T) { + entries, err := parseAPIBulkSiteEntries([]byte(`[ + "https://example.com/", + {"url":"https://wordpress.com/","check_keyword":"WordPress","forbidden_keyword":"database 
error","forbidden_keywords":["metrics.evil-cdn.example/collect.js","buy cheap viagra"],"redirect_policy":"follow","timeout_seconds":5} + ]`)) + if err != nil { + t.Fatalf("parseAPIBulkSiteEntries() error = %v", err) + } + if len(entries) != 2 { + t.Fatalf("len(entries) = %d, want 2", len(entries)) + } + if entries[0].MonitorURL != "https://example.com/" { + t.Fatalf("first URL = %q", entries[0].MonitorURL) + } + if entries[1].MonitorURL != "https://wordpress.com/" { + t.Fatalf("second URL = %q", entries[1].MonitorURL) + } + if entries[1].CheckKeyword == nil || *entries[1].CheckKeyword != "WordPress" { + t.Fatalf("check_keyword = %#v, want WordPress", entries[1].CheckKeyword) + } + if entries[1].ForbiddenKeyword == nil || *entries[1].ForbiddenKeyword != "database error" { + t.Fatalf("forbidden_keyword = %#v, want database error", entries[1].ForbiddenKeyword) + } + if len(entries[1].ForbiddenKeywords) != 2 || entries[1].ForbiddenKeywords[0] != "metrics.evil-cdn.example/collect.js" { + t.Fatalf("forbidden_keywords = %#v", entries[1].ForbiddenKeywords) + } + if entries[1].TimeoutSeconds == nil || *entries[1].TimeoutSeconds != 5 { + t.Fatalf("timeout_seconds = %#v, want 5", entries[1].TimeoutSeconds) + } +} + +func TestParseAPIBulkCSVSiteEntries(t *testing.T) { + source := strings.NewReader("monitor_url,check_keyword,forbidden_keyword,forbidden_keywords,redirect_policy,check_interval\nhttps://example.com/,Example Domain,database error,\"metrics.evil-cdn.example/collect.js,buy cheap viagra\",follow,5\n") + entries, err := loadAPIBulkSiteEntries(apiSitesBulkAddOptions{source: "stdin"}, source) + if err != nil { + t.Fatalf("loadAPIBulkSiteEntries() error = %v", err) + } + if len(entries) != 1 { + t.Fatalf("len(entries) = %d, want 1", len(entries)) + } + if entries[0].MonitorURL != "https://example.com/" { + t.Fatalf("monitor_url = %q", entries[0].MonitorURL) + } + if entries[0].CheckKeyword == nil || *entries[0].CheckKeyword != "Example Domain" { + t.Fatalf("check_keyword 
= %#v, want Example Domain", entries[0].CheckKeyword) + } + if entries[0].ForbiddenKeyword == nil || *entries[0].ForbiddenKeyword != "database error" { + t.Fatalf("forbidden_keyword = %#v, want database error", entries[0].ForbiddenKeyword) + } + if len(entries[0].ForbiddenKeywords) != 2 || entries[0].ForbiddenKeywords[1] != "buy cheap viagra" { + t.Fatalf("forbidden_keywords = %#v", entries[0].ForbiddenKeywords) + } + if entries[0].CheckInterval == nil || *entries[0].CheckInterval != 5 { + t.Fatalf("check_interval = %#v, want 5", entries[0].CheckInterval) + } +} + +func TestParseAPIBulkNewlineSiteEntries(t *testing.T) { + entries, err := parseAPIBulkSiteEntries([]byte("https://example.com/\nhttps://wordpress.com/\n")) + if err != nil { + t.Fatalf("parseAPIBulkSiteEntries() error = %v", err) + } + if len(entries) != 2 { + t.Fatalf("len(entries) = %d, want 2", len(entries)) + } + if entries[1].MonitorURL != "https://wordpress.com/" { + t.Fatalf("second URL = %q", entries[1].MonitorURL) + } +} + +func TestPlanAPIBulkSiteCreatesCyclesFixtureEntries(t *testing.T) { + var active apiOptionalBoolFlag + setTestFlag(t, &active, "false") + forbidden := "database error" + forbiddenKeywords := []string{"metrics.evil-cdn.example/collect.js", "buy cheap viagra"} + entries := []apiBulkSiteEntry{ + {MonitorURL: "https://example.com/", ForbiddenKeyword: &forbidden, ForbiddenKeywords: forbiddenKeywords}, + {MonitorURL: "https://wordpress.com/"}, + } + planned, err := planAPIBulkSiteCreates(entries, apiSitesBulkAddOptions{ + count: 3, + blogIDStart: 900, + monitorActive: active, + }) + if err != nil { + t.Fatalf("planAPIBulkSiteCreates() error = %v", err) + } + if len(planned) != 3 { + t.Fatalf("len(planned) = %d, want 3", len(planned)) + } + if planned[0].BlogID != 900 || planned[2].BlogID != 902 { + t.Fatalf("blog ids = %d, %d; want 900, 902", planned[0].BlogID, planned[2].BlogID) + } + if planned[2].MonitorURL != "https://example.com/" { + t.Fatalf("cycled URL = %q, want first 
source URL", planned[2].MonitorURL) + } + if planned[2].ForbiddenKeyword == nil || *planned[2].ForbiddenKeyword != "database error" { + t.Fatalf("cycled forbidden_keyword = %#v, want database error", planned[2].ForbiddenKeyword) + } + if planned[2].ForbiddenKeywords == nil || len(*planned[2].ForbiddenKeywords) != 2 { + t.Fatalf("cycled forbidden_keywords = %#v, want two values", planned[2].ForbiddenKeywords) + } + if planned[0].MonitorActive == nil || *planned[0].MonitorActive { + t.Fatalf("monitor_active = %#v, want false", planned[0].MonitorActive) + } +} + +func TestPlanAPIBulkSiteCreatesUsesBatchMarker(t *testing.T) { + entries := []apiBulkSiteEntry{{MonitorURL: "https://example.com/"}} + planned, err := planAPIBulkSiteCreates(entries, apiSitesBulkAddOptions{ + count: 1, + batch: "batch-a", + blogIDStart: defaultAPIBulkAddBlogIDStart, + }) + if err != nil { + t.Fatalf("planAPIBulkSiteCreates() error = %v", err) + } + if planned[0].BlogID != apiCLIBatchBlogIDStart("batch-a") { + t.Fatalf("blog_id = %d, want batch-derived id", planned[0].BlogID) + } + if planned[0].CustomHeaders == nil || (*planned[0].CustomHeaders)[apiCLIBatchHeader] != "batch-a" { + t.Fatalf("custom headers = %#v, want batch marker", planned[0].CustomHeaders) + } +} + +func TestPlanAPIBulkSiteCreatesRejectsUnboundedCount(t *testing.T) { + _, err := planAPIBulkSiteCreates([]apiBulkSiteEntry{{MonitorURL: "https://example.com/"}}, apiSitesBulkAddOptions{ + count: apiSitesBulkAddMaxCount + 1, + blogIDStart: 900, + }) + if err == nil { + t.Fatal("planAPIBulkSiteCreates() error = nil, want max count error") + } +} + +func TestLoadAPIBulkFixture(t *testing.T) { + entries, err := loadAPIBulkSiteEntries(apiSitesBulkAddOptions{source: "fixture"}, nil) + if err != nil { + t.Fatalf("load fixture error = %v", err) + } + if len(entries) < 8 { + t.Fatalf("fixture entries = %d, want at least 8", len(entries)) + } +} + +func TestMarshalAPIBulkSiteRequests(t *testing.T) { + keyword := "Example Domain" + 
forbidden := "database error" + requests := []apiSiteCreateRequest{{ + BlogID: 900, + MonitorURL: "https://example.com/", + CheckKeyword: &keyword, + ForbiddenKeyword: &forbidden, + ForbiddenKeywords: &[]string{"metrics.evil-cdn.example/collect.js", "buy cheap viagra"}, + }} + raw, err := marshalAPIBulkSiteRequests(requests) + if err != nil { + t.Fatalf("marshalAPIBulkSiteRequests() error = %v", err) + } + var got map[string]any + if err := json.Unmarshal(raw[0], &got); err != nil { + t.Fatalf("unmarshal request: %v", err) + } + if got["blog_id"] != float64(900) { + t.Fatalf("blog_id = %#v, want 900", got["blog_id"]) + } + if got["check_keyword"] != "Example Domain" { + t.Fatalf("check_keyword = %#v, want Example Domain", got["check_keyword"]) + } + if got["forbidden_keyword"] != "database error" { + t.Fatalf("forbidden_keyword = %#v, want database error", got["forbidden_keyword"]) + } + assertStringArray(t, got["forbidden_keywords"], []string{"metrics.evil-cdn.example/collect.js", "buy cheap viagra"}) +} diff --git a/cmd/jetmon2/api_cli_sites_cleanup.go b/cmd/jetmon2/api_cli_sites_cleanup.go new file mode 100644 index 00000000..e371efd0 --- /dev/null +++ b/cmd/jetmon2/api_cli_sites_cleanup.go @@ -0,0 +1,214 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "strconv" + "strings" +) + +type apiSitesCleanupOptions struct { + batch string + siteIDs apiInt64SliceFlags + count int + blogIDStart int64 + dryRun bool + ignoreNotFound bool + allowUnmarked bool +} + +type apiSitesCleanupSummary struct { + DryRun bool `json:"dry_run,omitempty"` + Batch string `json:"batch,omitempty"` + Count int `json:"count"` + Sites []apiSitesCleanupResult `json:"sites"` +} + +type apiSitesCleanupResult struct { + SiteID int64 `json:"site_id"` + Status string `json:"status"` + Error string `json:"error,omitempty"` +} + +func cmdAPISitesCleanup(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites cleanup", 
&opts) + cleanup := apiSitesCleanupOptions{ + count: apiSitesBulkAddMaxCount, + ignoreNotFound: true, + } + fs.StringVar(&cleanup.batch, "batch", "", "batch label whose deterministic site ids should be deleted") + fs.Var(&cleanup.siteIDs, "site-id", "explicit site id to delete (repeatable or comma-separated)") + fs.IntVar(&cleanup.count, "count", cleanup.count, "number of batch-derived site ids to delete, max 200") + fs.Int64Var(&cleanup.blogIDStart, "blog-id-start", 0, "first batch blog_id; default derives from --batch") + fs.BoolVar(&cleanup.dryRun, "dry-run", false, "print the planned deletes without sending requests") + fs.BoolVar(&cleanup.ignoreNotFound, "ignore-not-found", cleanup.ignoreNotFound, "treat 404 responses as already cleaned") + fs.BoolVar(&cleanup.allowUnmarked, "allow-unmarked", false, "allow cleanup of --batch targets that do not expose the matching CLI batch marker") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api sites cleanup [flags]") + } + return runAPISitesCleanup(context.Background(), nil, opts, cleanup) +} + +func runAPISitesCleanup(ctx context.Context, client *http.Client, opts apiCLIOptions, cleanup apiSitesCleanupOptions) error { + if opts.out == nil { + opts.out = io.Discard + } + siteIDs, err := apiCleanupSiteIDs(cleanup) + if err != nil { + return err + } + if !cleanup.dryRun { + remote, err := requireAPILocalOrAllowRemote(opts, opts.allowRemote, "api sites cleanup") + if err != nil { + return err + } + if remote { + if strings.TrimSpace(cleanup.batch) == "" { + return errors.New("api sites cleanup requires --batch when --allow-remote targets a non-local API") + } + if cleanup.allowUnmarked { + return errors.New("api sites cleanup cannot use --allow-unmarked with --allow-remote") + } + } + } + + summary := apiSitesCleanupSummary{ + DryRun: cleanup.dryRun, + Batch: cleanup.batch, + Count: len(siteIDs), + Sites: make([]apiSitesCleanupResult, 0, 
len(siteIDs)), + } + for _, siteID := range siteIDs { + result := apiSitesCleanupResult{SiteID: siteID} + if cleanup.dryRun { + result.Status = "would_delete" + summary.Sites = append(summary.Sites, result) + continue + } + if cleanup.batch != "" && !cleanup.allowUnmarked { + ok, exists, err := apiSiteBelongsToBatch(ctx, client, opts, siteID, cleanup.batch) + if err != nil { + result.Status = "failed" + result.Error = err.Error() + summary.Sites = append(summary.Sites, result) + _ = writeAPIValueOutput(opts.out, summary, opts) + return fmt.Errorf("verify site %d batch marker: %w", siteID, err) + } + if !exists && cleanup.ignoreNotFound { + result.Status = "not_found" + summary.Sites = append(summary.Sites, result) + continue + } + if !exists { + result.Status = "failed" + result.Error = "site not found" + summary.Sites = append(summary.Sites, result) + _ = writeAPIValueOutput(opts.out, summary, opts) + return fmt.Errorf("site %d not found", siteID) + } + if !ok { + result.Status = "skipped_unmatched_batch" + result.Error = fmt.Sprintf("site does not expose cli_batch %q", cleanup.batch) + summary.Sites = append(summary.Sites, result) + _ = writeAPIValueOutput(opts.out, summary, opts) + return fmt.Errorf("site %d does not belong to CLI batch %q", siteID, cleanup.batch) + } + } + resp, err := doAPIRequest(ctx, client, opts, http.MethodDelete, "/api/v1/sites/"+strconv.FormatInt(siteID, 10), nil) + if err != nil { + result.Status = "failed" + result.Error = err.Error() + summary.Sites = append(summary.Sites, result) + _ = writeAPIValueOutput(opts.out, summary, opts) + return fmt.Errorf("delete site %d: %w", siteID, err) + } + switch { + case resp.StatusCode == http.StatusNotFound && cleanup.ignoreNotFound: + result.Status = "not_found" + case resp.StatusCode >= 400: + result.Status = "failed" + result.Error = strings.TrimSpace(string(resp.Body)) + if result.Error == "" { + result.Error = resp.Status + } + summary.Sites = append(summary.Sites, result) + _ = 
writeAPIValueOutput(opts.out, summary, opts) + return fmt.Errorf("delete site %d returned %s", siteID, resp.Status) + default: + result.Status = "deleted" + } + summary.Sites = append(summary.Sites, result) + } + return writeAPIValueOutput(opts.out, summary, opts) +} + +func apiCleanupSiteIDs(cleanup apiSitesCleanupOptions) ([]int64, error) { + if cleanup.siteIDs.set { + return cleanup.siteIDs.valuesOrEmpty(), nil + } + if cleanup.batch == "" && cleanup.blogIDStart == 0 { + return nil, errors.New("use --batch, --blog-id-start, or --site-id") + } + if cleanup.count <= 0 { + return nil, errors.New("count must be positive") + } + if cleanup.count > apiSitesBulkAddMaxCount { + return nil, fmt.Errorf("count must be <= %d", apiSitesBulkAddMaxCount) + } + start := cleanup.blogIDStart + if start == 0 { + start = apiCLIBatchBlogIDStart(cleanup.batch) + } + if start <= 0 { + return nil, errors.New("blog-id-start must be positive") + } + ids := make([]int64, 0, cleanup.count) + for i := 0; i < cleanup.count; i++ { + ids = append(ids, start+int64(i)) + } + return ids, nil +} + +func apiSiteBelongsToBatch(ctx context.Context, client *http.Client, opts apiCLIOptions, siteID int64, batch string) (bool, bool, error) { + resp, err := doAPIRequest(ctx, client, opts, http.MethodGet, apiSitePathWithCLIMetadata(siteID), nil) + if err != nil { + return false, false, err + } + if resp.StatusCode == http.StatusNotFound { + return false, false, nil + } + if resp.StatusCode >= 400 { + body := strings.TrimSpace(string(resp.Body)) + if body == "" { + body = resp.Status + } + return false, true, fmt.Errorf("site lookup returned %s: %s", resp.Status, body) + } + siteBatch, err := apiSiteCLIBatch(resp.Body) + if err != nil { + return false, true, err + } + return siteBatch == batch, true, nil +} + +func apiSitePathWithCLIMetadata(siteID int64) string { + return "/api/v1/sites/" + strconv.FormatInt(siteID, 10) + "?include_cli_metadata=true" +} + +func apiSiteCLIBatch(body []byte) (string, error) 
{ + var site struct { + CLIBatch string `json:"cli_batch"` + } + if err := json.Unmarshal(body, &site); err != nil { + return "", err + } + return site.CLIBatch, nil +} diff --git a/cmd/jetmon2/api_cli_sites_cleanup_test.go b/cmd/jetmon2/api_cli_sites_cleanup_test.go new file mode 100644 index 00000000..04ac57fc --- /dev/null +++ b/cmd/jetmon2/api_cli_sites_cleanup_test.go @@ -0,0 +1,150 @@ +package main + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strconv" + "strings" + "testing" + "time" +) + +func TestRunAPISitesCleanupDeletesBatchAndIgnoresMissing(t *testing.T) { + var calls []string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + calls = append(calls, r.Method+" "+r.URL.RequestURI()) + switch { + case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "000"): + writeTestJSON(t, w, map[string]any{"id": apiCLIBatchBlogIDStart("cleanup-batch"), "cli_batch": "cleanup-batch"}) + case r.Method == http.MethodDelete && strings.HasSuffix(r.URL.Path, "000"): + w.WriteHeader(http.StatusNoContent) + case r.Method == http.MethodGet && strings.HasSuffix(r.URL.Path, "001"): + writeTestStatusJSON(t, w, http.StatusNotFound, map[string]string{"code": "site_not_found"}) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String()) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + start := apiCLIBatchBlogIDStart("cleanup-batch") + err := runAPISitesCleanup(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesCleanupOptions{ + batch: "cleanup-batch", + count: 2, + ignoreNotFound: true, + }) + if err != nil { + t.Fatalf("runAPISitesCleanup() error = %v\nstdout=%s", err, stdout.String()) + } + var summary apiSitesCleanupSummary + if err := json.Unmarshal(stdout.Bytes(), &summary); err != nil { + t.Fatalf("unmarshal summary: %v\n%s", err, stdout.String()) + } + if 
summary.Batch != "cleanup-batch" || summary.Count != 2 { + t.Fatalf("summary = %#v", summary) + } + if summary.Sites[0].SiteID != start || summary.Sites[0].Status != "deleted" { + t.Fatalf("first cleanup result = %#v", summary.Sites[0]) + } + if summary.Sites[1].SiteID != start+1 || summary.Sites[1].Status != "not_found" { + t.Fatalf("second cleanup result = %#v", summary.Sites[1]) + } + wantCalls := []string{ + "GET /api/v1/sites/" + strconvInt64(start) + "?include_cli_metadata=true", + "DELETE /api/v1/sites/" + strconvInt64(start), + "GET /api/v1/sites/" + strconvInt64(start+1) + "?include_cli_metadata=true", + } + if strings.Join(calls, "\n") != strings.Join(wantCalls, "\n") { + t.Fatalf("calls:\n%s\nwant:\n%s", strings.Join(calls, "\n"), strings.Join(wantCalls, "\n")) + } +} + +func TestRunAPISitesCleanupRejectsUnmatchedBatchMarker(t *testing.T) { + var calls []string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + calls = append(calls, r.Method+" "+r.URL.RequestURI()) + switch { + case r.Method == http.MethodGet: + writeTestJSON(t, w, map[string]any{"id": 42, "cli_batch": "other-batch"}) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String()) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISitesCleanup(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesCleanupOptions{ + batch: "cleanup-batch", + siteIDs: mustSiteIDs(t, "42"), + ignoreNotFound: true, + }) + if err == nil { + t.Fatal("runAPISitesCleanup() error = nil, want batch mismatch") + } + var summary apiSitesCleanupSummary + if err := json.Unmarshal(stdout.Bytes(), &summary); err != nil { + t.Fatalf("unmarshal summary: %v\n%s", err, stdout.String()) + } + if got := summary.Sites[0].Status; got != "skipped_unmatched_batch" { + t.Fatalf("status = %q, want skipped_unmatched_batch", got) + } + if strings.Join(calls, "\n") != 
"GET /api/v1/sites/42?include_cli_metadata=true" { + t.Fatalf("calls:\n%s\nwant only GET", strings.Join(calls, "\n")) + } +} + +func TestRunAPISitesCleanupDryRunTable(t *testing.T) { + var stdout bytes.Buffer + err := runAPISitesCleanup(context.Background(), nil, apiCLIOptions{ + output: "table", + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesCleanupOptions{ + siteIDs: mustSiteIDs(t, "42,43"), + dryRun: true, + }) + if err != nil { + t.Fatalf("runAPISitesCleanup() error = %v", err) + } + got := stdout.String() + for _, want := range []string{ + "site_id status", + "42 would_delete", + "43 would_delete", + } { + if !strings.Contains(got, want) { + t.Fatalf("table missing %q:\n%s", want, got) + } + } +} + +func TestAPICleanupSiteIDsFromBatch(t *testing.T) { + ids, err := apiCleanupSiteIDs(apiSitesCleanupOptions{batch: "batch-a", count: 3}) + if err != nil { + t.Fatalf("apiCleanupSiteIDs() error = %v", err) + } + start := apiCLIBatchBlogIDStart("batch-a") + want := []int64{start, start + 1, start + 2} + for i := range want { + if ids[i] != want[i] { + t.Fatalf("ids[%d] = %d, want %d", i, ids[i], want[i]) + } + } +} + +func strconvInt64(v int64) string { + return strconv.FormatInt(v, 10) +} diff --git a/cmd/jetmon2/api_cli_sites_fixture.go b/cmd/jetmon2/api_cli_sites_fixture.go new file mode 100644 index 00000000..29e3829e --- /dev/null +++ b/cmd/jetmon2/api_cli_sites_fixture.go @@ -0,0 +1,6 @@ +package main + +import _ "embed" + +//go:embed testdata/api-cli-sites.json +var apiCLISiteFixture []byte diff --git a/cmd/jetmon2/api_cli_sites_simulate.go b/cmd/jetmon2/api_cli_sites_simulate.go new file mode 100644 index 00000000..d2e36ec1 --- /dev/null +++ b/cmd/jetmon2/api_cli_sites_simulate.go @@ -0,0 +1,697 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "io" + "net" + "net/http" + "net/url" + "strings" + "time" +) + +const ( + apiFixtureAuto = "auto" + apiFixtureOff = "off" + defaultAPIFixtureMonitorURL = "http://api-fixture:8091" + 
defaultAPIFixtureProbeURL = "http://localhost:18091/health" +) + +type apiSitesSimulateFailureOptions struct { + mode string + batch string + siteIDs apiInt64SliceFlags + count int + blogIDStart int64 + createMissing bool + trigger bool + wait time.Duration + pollInterval time.Duration + idempotencyKeyPrefix string + fixtureURL string + fixtureProbeURL string + allowUnmarkedBatch bool + expectEventState string + expectEventSeverity apiOptionalIntFlag + requireTransition bool + expectTransitionReason string +} + +type apiFailureModeDefinition struct { + Mode string + Description string + MonitorURL string + CheckKeyword *string + RedirectPolicy string + TimeoutSeconds *int + CustomHeaders map[string]string +} + +type apiSimulateFailureSummary struct { + Mode string `json:"mode"` + Batch string `json:"batch,omitempty"` + Wait string `json:"wait"` + Trigger bool `json:"trigger"` + CreateMissing bool `json:"create_missing"` + FixtureURL string `json:"fixture_url,omitempty"` + Sites []apiSimulatedSiteResult `json:"sites"` +} + +type apiSimulatedSiteResult struct { + SiteID int64 `json:"site_id"` + Action string `json:"action"` + TriggerStatus string `json:"trigger_status,omitempty"` + EventIDs []int64 `json:"event_ids,omitempty"` + EventStates []string `json:"event_states,omitempty"` + EventSeverities []int `json:"event_severities,omitempty"` + TransitionCount int `json:"transition_count"` + Site json.RawMessage `json:"site,omitempty"` + TriggerNow json.RawMessage `json:"trigger_now,omitempty"` + Events json.RawMessage `json:"events,omitempty"` + Transitions []apiSimulatedTransition `json:"transitions,omitempty"` + Note string `json:"note,omitempty"` + Error string `json:"error,omitempty"` +} + +type apiSimulatedTransition struct { + EventID int64 `json:"event_id"` + Transitions json.RawMessage `json:"transitions"` +} + +func cmdAPISitesSimulateFailure(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api sites simulate-failure", &opts) + sim 
:= apiSitesSimulateFailureOptions{ + mode: "http-500", + count: 1, + trigger: true, + pollInterval: 2 * time.Second, + fixtureURL: envOrDefault("JETMON_API_FIXTURE_URL", apiFixtureAuto), + fixtureProbeURL: envOrDefault( + "JETMON_API_FIXTURE_PROBE_URL", + defaultAPIFixtureProbeURL, + ), + } + fs.StringVar(&sim.mode, "mode", sim.mode, "failure mode: unreachable, http-500, http-403, redirect, keyword, timeout, or tls") + fs.StringVar(&sim.batch, "batch", "", "batch label whose deterministic site ids should be mutated") + fs.Var(&sim.siteIDs, "site-id", "explicit site id to mutate (repeatable or comma-separated)") + fs.IntVar(&sim.count, "count", sim.count, "number of batch-derived site ids to mutate") + fs.Int64Var(&sim.blogIDStart, "blog-id-start", 0, "first batch blog_id; default derives from --batch") + fs.BoolVar(&sim.createMissing, "create-missing", false, "create a site if the target id does not exist") + fs.BoolVar(&sim.trigger, "trigger", sim.trigger, "call trigger-now after mutation") + fs.DurationVar(&sim.wait, "wait", 0, "poll duration for active events after mutation") + fs.DurationVar(&sim.pollInterval, "poll-interval", sim.pollInterval, "active-event poll interval when --wait is set") + fs.StringVar(&sim.idempotencyKeyPrefix, "idempotency-key-prefix", "", "prefix for per-site POST Idempotency-Key headers") + fs.StringVar(&sim.fixtureURL, "fixture-url", sim.fixtureURL, "Docker fixture monitor URL, auto, or off") + fs.StringVar(&sim.fixtureProbeURL, "fixture-probe-url", sim.fixtureProbeURL, "URL used when --fixture-url=auto") + fs.BoolVar(&sim.allowUnmarkedBatch, "allow-unmarked", false, "allow mutation of --batch targets that do not expose the matching CLI batch marker") + fs.StringVar(&sim.expectEventState, "expect-event-state", "", "require at least one active event with this state after polling") + fs.Var(&sim.expectEventSeverity, "expect-event-severity", "require at least one active event with this severity after polling") + 
fs.BoolVar(&sim.requireTransition, "require-transition", false, "require at least one event transition after polling") + fs.StringVar(&sim.expectTransitionReason, "expect-transition-reason", "", "require at least one transition with this reason after polling") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api sites simulate-failure [flags]") + } + return runAPISitesSimulateFailure(context.Background(), nil, opts, sim) +} + +func runAPISitesSimulateFailure(ctx context.Context, client *http.Client, opts apiCLIOptions, sim apiSitesSimulateFailureOptions) error { + if opts.out == nil { + opts.out = io.Discard + } + remote, err := requireAPILocalOrAllowRemote(opts, opts.allowRemote, "api sites simulate-failure") + if err != nil { + return err + } + if remote { + if strings.TrimSpace(sim.batch) == "" { + return errors.New("api sites simulate-failure requires --batch when --allow-remote targets a non-local API") + } + if sim.allowUnmarkedBatch { + return errors.New("api sites simulate-failure cannot use --allow-unmarked with --allow-remote") + } + } + fixtureURL := apiSimulationFixtureURL(ctx, sim) + def, err := apiFailureMode(sim.mode, fixtureURL) + if err != nil { + return err + } + siteIDs, err := apiSimulationSiteIDs(sim) + if err != nil { + return err + } + if sim.pollInterval <= 0 { + return errors.New("poll-interval must be positive") + } + + summary := apiSimulateFailureSummary{ + Mode: def.Mode, + Batch: sim.batch, + Wait: sim.wait.String(), + Trigger: sim.trigger, + CreateMissing: sim.createMissing, + FixtureURL: fixtureURL, + Sites: make([]apiSimulatedSiteResult, 0, len(siteIDs)), + } + for i, siteID := range siteIDs { + result, err := runAPISiteSimulation(ctx, client, opts, sim, def, siteID, i) + summary.Sites = append(summary.Sites, result) + if err != nil { + summary.Sites[len(summary.Sites)-1].Error = err.Error() + _ = writeAPIValueOutput(opts.out, summary, opts) + return 
fmt.Errorf("simulate failure for site %d: %w", siteID, err) + } + } + return writeAPIValueOutput(opts.out, summary, opts) +} + +func runAPISiteSimulation(ctx context.Context, client *http.Client, opts apiCLIOptions, sim apiSitesSimulateFailureOptions, def apiFailureModeDefinition, siteID int64, index int) (apiSimulatedSiteResult, error) { + result := apiSimulatedSiteResult{SiteID: siteID} + if sim.batch != "" && !sim.allowUnmarkedBatch { + ok, exists, err := apiSiteBelongsToBatch(ctx, client, opts, siteID, sim.batch) + if err != nil { + return result, err + } + if exists && !ok { + return result, fmt.Errorf("site %d does not belong to CLI batch %q", siteID, sim.batch) + } + } + update := apiSiteUpdateRequest{ + MonitorURL: &def.MonitorURL, + CheckKeyword: def.CheckKeyword, + RedirectPolicy: &def.RedirectPolicy, + TimeoutSeconds: def.TimeoutSeconds, + } + if len(def.CustomHeaders) > 0 || sim.batch != "" { + headers := make(map[string]string, len(def.CustomHeaders)+1) + for k, v := range def.CustomHeaders { + headers[k] = v + } + if sim.batch != "" { + headers[apiCLIBatchHeader] = sim.batch + } + update.CustomHeaders = &headers + } + + site, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodPatch, fmt.Sprintf("/api/v1/sites/%d", siteID), update, "") + if err != nil { + var httpErr apiWorkflowHTTPError + if errors.As(err, &httpErr) && strings.Contains(httpErr.Status, "404") && sim.createMissing { + site, err = createMissingSimulationSite(ctx, client, opts, sim, def, siteID, index) + if err != nil { + return result, err + } + result.Action = "created" + } else { + return result, err + } + } else { + result.Action = "updated" + } + result.Site = site + + if sim.trigger { + body, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodPost, fmt.Sprintf("/api/v1/sites/%d/trigger-now", siteID), nil, apiSimulationIDKey(sim, index, "trigger-now")) + if err != nil { + return result, err + } + result.TriggerNow = body + result.TriggerStatus = 
apiTriggerNowStatus(body) + } else { + result.TriggerStatus = "skipped" + } + + events, transitions, err := waitForSimulationEvents(ctx, client, opts, siteID, sim) + if err != nil { + return result, err + } + result.Events = events + result.Transitions = transitions + result.EventIDs, result.EventStates, result.EventSeverities = summarizeSimulationEvents(events) + result.TransitionCount = simulationTransitionCount(transitions) + if len(transitions) == 0 { + result.Note = "no active events returned; trigger-now reports check results but regular orchestrator rounds create failure events" + } + if err := validateSimulationExpectations(result, sim); err != nil { + return result, err + } + return result, nil +} + +func createMissingSimulationSite(ctx context.Context, client *http.Client, opts apiCLIOptions, sim apiSitesSimulateFailureOptions, def apiFailureModeDefinition, siteID int64, index int) (json.RawMessage, error) { + headers := map[string]string{} + for k, v := range def.CustomHeaders { + headers[k] = v + } + if sim.batch != "" { + headers[apiCLIBatchHeader] = sim.batch + } + req := apiSiteCreateRequest{ + BlogID: siteID, + MonitorURL: def.MonitorURL, + CheckKeyword: def.CheckKeyword, + RedirectPolicy: &def.RedirectPolicy, + TimeoutSeconds: def.TimeoutSeconds, + } + if len(headers) > 0 { + req.CustomHeaders = &headers + } + return apiWorkflowRequestJSON(ctx, client, opts, http.MethodPost, "/api/v1/sites", req, apiSimulationIDKey(sim, index, "create-site")) +} + +func waitForSimulationEvents(ctx context.Context, client *http.Client, opts apiCLIOptions, siteID int64, sim apiSitesSimulateFailureOptions) (json.RawMessage, []apiSimulatedTransition, error) { + deadline := time.Now().Add(sim.wait) + for { + body, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodGet, fmt.Sprintf("/api/v1/sites/%d/events?active=true&limit=10", siteID), nil, "") + if err != nil { + return nil, nil, err + } + ids := eventIDsFromList(body) + transitions, err := 
querySimulationTransitions(ctx, client, opts, siteID, ids) + if err != nil { + return nil, nil, err + } + if simulationHasExpectations(sim) && sim.wait > 0 { + result := apiSimulatedSiteResult{SiteID: siteID, Events: body, Transitions: transitions} + if validateSimulationExpectations(result, sim) == nil { + return body, transitions, nil + } + } else if len(ids) > 0 || sim.wait <= 0 || time.Now().After(deadline) { + return body, transitions, nil + } + if sim.wait <= 0 || time.Now().After(deadline) { + return body, transitions, nil + } + select { + case <-ctx.Done(): + return nil, nil, ctx.Err() + case <-time.After(sim.pollInterval): + } + } +} + +func querySimulationTransitions(ctx context.Context, client *http.Client, opts apiCLIOptions, siteID int64, eventIDs []int64) ([]apiSimulatedTransition, error) { + out := make([]apiSimulatedTransition, 0, len(eventIDs)) + for _, eventID := range eventIDs { + body, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodGet, fmt.Sprintf("/api/v1/sites/%d/events/%d/transitions", siteID, eventID), nil, "") + if err != nil { + return nil, err + } + out = append(out, apiSimulatedTransition{EventID: eventID, Transitions: body}) + } + return out, nil +} + +func eventIDsFromList(body json.RawMessage) []int64 { + events, err := simulationEventsFromList(body) + if err != nil { + return nil + } + ids := make([]int64, 0, len(events)) + for _, event := range events { + if event.ID > 0 { + ids = append(ids, event.ID) + } + } + return ids +} + +type apiSimulationListedEvent struct { + ID int64 `json:"id"` + State string `json:"state"` + Severity int `json:"severity"` +} + +type apiSimulationListedTransition struct { + ID int64 `json:"id"` + EventID int64 `json:"event_id"` + Reason string `json:"reason"` + StateAfter *string `json:"state_after"` + SeverityAfter *int `json:"severity_after"` +} + +func simulationEventsFromList(body json.RawMessage) ([]apiSimulationListedEvent, error) { + var envelope struct { + Data 
[]apiSimulationListedEvent `json:"data"` + } + if err := json.Unmarshal(body, &envelope); err != nil { + return nil, err + } + return envelope.Data, nil +} + +func simulationTransitionsFromResults(results []apiSimulatedTransition) ([]apiSimulationListedTransition, error) { + rows := []apiSimulationListedTransition{} + for _, result := range results { + var envelope struct { + Data []apiSimulationListedTransition `json:"data"` + } + if err := json.Unmarshal(result.Transitions, &envelope); err != nil { + return nil, err + } + rows = append(rows, envelope.Data...) + } + return rows, nil +} + +func apiTriggerNowStatus(body json.RawMessage) string { + var envelope struct { + Result struct { + HTTPCode int `json:"http_code"` + ErrorCode int `json:"error_code"` + Success bool `json:"success"` + } `json:"result"` + } + if err := json.Unmarshal(body, &envelope); err != nil { + return "unknown" + } + if envelope.Result.Success { + return "success" + } + if envelope.Result.HTTPCode > 0 { + return fmt.Sprintf("failed_http_%d", envelope.Result.HTTPCode) + } + if envelope.Result.ErrorCode > 0 { + return fmt.Sprintf("failed_error_%d", envelope.Result.ErrorCode) + } + return "failed" +} + +func summarizeSimulationEvents(body json.RawMessage) ([]int64, []string, []int) { + events, err := simulationEventsFromList(body) + if err != nil { + return nil, nil, nil + } + ids := make([]int64, 0, len(events)) + states := make([]string, 0, len(events)) + severities := make([]int, 0, len(events)) + for _, event := range events { + ids = append(ids, event.ID) + states = append(states, event.State) + severities = append(severities, event.Severity) + } + return ids, states, severities +} + +func simulationTransitionCount(results []apiSimulatedTransition) int { + transitions, err := simulationTransitionsFromResults(results) + if err != nil { + return 0 + } + return len(transitions) +} + +func validateSimulationExpectations(result apiSimulatedSiteResult, sim apiSitesSimulateFailureOptions) error { 
+ if !simulationHasExpectations(sim) { + return nil + } + events, err := simulationEventsFromList(result.Events) + if err != nil { + return fmt.Errorf("decode active events response: %w", err) + } + var failures []string + if sim.expectEventState != "" && !simulationHasEventState(events, sim.expectEventState) { + failures = append(failures, fmt.Sprintf("expected active event state %q, got %s", sim.expectEventState, formatSimulationEvents(events))) + } + if sim.expectEventSeverity.set && !simulationHasEventSeverity(events, sim.expectEventSeverity.value) { + failures = append(failures, fmt.Sprintf("expected active event severity %d, got %s", sim.expectEventSeverity.value, formatSimulationEvents(events))) + } + transitions, err := simulationTransitionsFromResults(result.Transitions) + if err != nil { + return fmt.Errorf("decode transition response: %w", err) + } + if sim.requireTransition && len(transitions) == 0 { + failures = append(failures, "expected at least one transition, got none") + } + if sim.expectTransitionReason != "" && !simulationHasTransitionReason(transitions, sim.expectTransitionReason) { + failures = append(failures, fmt.Sprintf("expected transition reason %q, got %s", sim.expectTransitionReason, formatSimulationTransitions(transitions))) + } + if len(failures) > 0 { + return errors.New(strings.Join(failures, "; ")) + } + return nil +} + +func simulationHasExpectations(sim apiSitesSimulateFailureOptions) bool { + return sim.expectEventState != "" || + sim.expectEventSeverity.set || + sim.requireTransition || + sim.expectTransitionReason != "" +} + +func simulationHasEventState(events []apiSimulationListedEvent, state string) bool { + for _, event := range events { + if event.State == state { + return true + } + } + return false +} + +func simulationHasEventSeverity(events []apiSimulationListedEvent, severity int) bool { + for _, event := range events { + if event.Severity == severity { + return true + } + } + return false +} + +func 
simulationHasTransitionReason(transitions []apiSimulationListedTransition, reason string) bool { + for _, transition := range transitions { + if transition.Reason == reason { + return true + } + } + return false +} + +func formatSimulationEvents(events []apiSimulationListedEvent) string { + if len(events) == 0 { + return "none" + } + parts := make([]string, 0, len(events)) + for _, event := range events { + parts = append(parts, fmt.Sprintf("#%d state=%q severity=%d", event.ID, event.State, event.Severity)) + } + return strings.Join(parts, ", ") +} + +func formatSimulationTransitions(transitions []apiSimulationListedTransition) string { + if len(transitions) == 0 { + return "none" + } + parts := make([]string, 0, len(transitions)) + for _, transition := range transitions { + parts = append(parts, fmt.Sprintf("#%d event=%d reason=%q", transition.ID, transition.EventID, transition.Reason)) + } + return strings.Join(parts, ", ") +} + +func apiSimulationSiteIDs(sim apiSitesSimulateFailureOptions) ([]int64, error) { + if sim.siteIDs.set { + return sim.siteIDs.valuesOrEmpty(), nil + } + if sim.batch == "" && sim.blogIDStart == 0 { + return nil, errors.New("use --batch, --blog-id-start, or --site-id") + } + if sim.count <= 0 { + return nil, errors.New("count must be positive") + } + start := sim.blogIDStart + if start == 0 { + start = apiCLIBatchBlogIDStart(sim.batch) + } + if start <= 0 { + return nil, errors.New("blog-id-start must be positive") + } + ids := make([]int64, 0, sim.count) + for i := 0; i < sim.count; i++ { + ids = append(ids, start+int64(i)) + } + return ids, nil +} + +func apiSimulationIDKey(sim apiSitesSimulateFailureOptions, index int, suffix string) string { + if sim.idempotencyKeyPrefix == "" { + return "" + } + return fmt.Sprintf("%s-%03d-%s", sim.idempotencyKeyPrefix, index+1, suffix) +} + +func apiSimulationFixtureURL(ctx context.Context, sim apiSitesSimulateFailureOptions) string { + fixtureURL := strings.TrimSpace(sim.fixtureURL) + switch 
strings.ToLower(fixtureURL) { + case "", apiFixtureOff, "none", "false": + return "" + case apiFixtureAuto: + if apiFixtureAvailable(ctx, sim.fixtureProbeURL) { + return defaultAPIFixtureMonitorURL + } + return "" + default: + return strings.TrimRight(fixtureURL, "/") + } +} + +func apiFixtureAvailable(ctx context.Context, probeURL string) bool { + probeURL = strings.TrimSpace(probeURL) + if probeURL == "" { + return false + } + probeCtx, cancel := context.WithTimeout(ctx, 750*time.Millisecond) + defer cancel() + req, err := http.NewRequestWithContext(probeCtx, http.MethodGet, probeURL, nil) + if err != nil { + return false + } + resp, err := http.DefaultClient.Do(req) + if err != nil { + return false + } + defer resp.Body.Close() + return resp.StatusCode >= 200 && resp.StatusCode < 300 +} + +func apiFailureMode(mode, fixtureBase string) (apiFailureModeDefinition, error) { + if strings.TrimSpace(fixtureBase) != "" { + return apiFixtureFailureMode(mode, strings.TrimRight(fixtureBase, "/")) + } + + policyFollow := "follow" + policyFail := "fail" + missingKeyword := "jetmon-api-cli-keyword-that-should-not-exist" + timeoutShort := 2 + switch mode { + case "unreachable": + return apiFailureModeDefinition{ + Mode: mode, + Description: "reserved TEST-NET-1 address expected to be unreachable", + MonitorURL: "http://192.0.2.1/", + RedirectPolicy: policyFollow, + TimeoutSeconds: &timeoutShort, + }, nil + case "http-500": + return apiFailureModeDefinition{ + Mode: mode, + Description: "HTTP 500 response", + MonitorURL: "https://httpbin.org/status/500", + RedirectPolicy: policyFollow, + }, nil + case "http-403": + return apiFailureModeDefinition{ + Mode: mode, + Description: "HTTP 403 response", + MonitorURL: "https://httpbin.org/status/403", + RedirectPolicy: policyFollow, + }, nil + case "redirect": + return apiFailureModeDefinition{ + Mode: mode, + Description: "redirect response with fail policy", + MonitorURL: 
"https://httpbin.org/redirect-to?url=https%3A%2F%2Fexample.com%2F", + RedirectPolicy: policyFail, + }, nil + case "keyword": + return apiFailureModeDefinition{ + Mode: mode, + Description: "keyword mismatch against example.com", + MonitorURL: "https://example.com/", + CheckKeyword: &missingKeyword, + RedirectPolicy: policyFollow, + }, nil + case "timeout": + return apiFailureModeDefinition{ + Mode: mode, + Description: "slow response with short timeout", + MonitorURL: "https://httpbin.org/delay/10", + RedirectPolicy: policyFollow, + TimeoutSeconds: &timeoutShort, + }, nil + case "tls": + return apiFailureModeDefinition{ + Mode: mode, + Description: "expired TLS certificate", + MonitorURL: "https://expired.badssl.com/", + RedirectPolicy: policyFollow, + }, nil + default: + return apiFailureModeDefinition{}, errors.New("mode must be one of: unreachable, http-500, http-403, redirect, keyword, timeout, tls") + } +} + +func apiFixtureFailureMode(mode, fixtureBase string) (apiFailureModeDefinition, error) { + policyFollow := "follow" + policyFail := "fail" + missingKeyword := "jetmon-api-cli-keyword-that-should-not-exist" + timeoutShort := 1 + switch mode { + case "unreachable": + return apiFailureMode(mode, "") + case "http-500": + return apiFailureModeDefinition{ + Mode: mode, + Description: "Docker fixture HTTP 500 response", + MonitorURL: fixtureBase + "/status/500", + RedirectPolicy: policyFollow, + }, nil + case "http-403": + return apiFailureModeDefinition{ + Mode: mode, + Description: "Docker fixture HTTP 403 response", + MonitorURL: fixtureBase + "/status/403", + RedirectPolicy: policyFollow, + }, nil + case "redirect": + return apiFailureModeDefinition{ + Mode: mode, + Description: "Docker fixture redirect response with fail policy", + MonitorURL: fixtureBase + "/redirect", + RedirectPolicy: policyFail, + }, nil + case "keyword": + return apiFailureModeDefinition{ + Mode: mode, + Description: "Docker fixture keyword mismatch", + MonitorURL: fixtureBase + 
"/keyword", + CheckKeyword: &missingKeyword, + RedirectPolicy: policyFollow, + }, nil + case "timeout": + return apiFailureModeDefinition{ + Mode: mode, + Description: "Docker fixture slow response with short timeout", + MonitorURL: fixtureBase + "/slow?delay=5s", + RedirectPolicy: policyFollow, + TimeoutSeconds: &timeoutShort, + }, nil + case "tls": + return apiFailureModeDefinition{ + Mode: mode, + Description: "Docker fixture self-signed TLS certificate", + MonitorURL: apiFixtureTLSBase(fixtureBase) + "/tls", + RedirectPolicy: policyFollow, + }, nil + default: + return apiFailureModeDefinition{}, errors.New("mode must be one of: unreachable, http-500, http-403, redirect, keyword, timeout, tls") + } +} + +func apiFixtureTLSBase(fixtureBase string) string { + u, err := url.Parse(fixtureBase) + if err != nil || u.Host == "" { + return strings.TrimRight(fixtureBase, "/") + } + u.Scheme = "https" + host, port, err := net.SplitHostPort(u.Host) + if err == nil && port == "8091" { + u.Host = net.JoinHostPort(host, "8443") + } + return strings.TrimRight(u.String(), "/") +} diff --git a/cmd/jetmon2/api_cli_sites_simulate_test.go b/cmd/jetmon2/api_cli_sites_simulate_test.go new file mode 100644 index 00000000..71c7f833 --- /dev/null +++ b/cmd/jetmon2/api_cli_sites_simulate_test.go @@ -0,0 +1,376 @@ +package main + +import ( + "bytes" + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestRunAPISitesSimulateFailureUpdatesAndReportsEvents(t *testing.T) { + var severity apiOptionalIntFlag + setTestFlag(t, &severity, "3") + var calls []string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + calls = append(calls, r.Method+" "+r.URL.String()) + switch { + case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/sites/42": + var body map[string]any + decodeTestJSON(t, r, &body) + if body["monitor_url"] != "https://httpbin.org/status/500" { + t.Fatalf("monitor_url = %#v, want 
http-500 URL", body["monitor_url"]) + } + writeTestJSON(t, w, map[string]any{"id": 42, "monitor_url": body["monitor_url"]}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/sites/42/trigger-now": + writeTestJSON(t, w, map[string]any{"result": map[string]any{"success": false, "http_code": 500}}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/42/events": + writeTestJSON(t, w, map[string]any{ + "data": []any{map[string]any{"id": 99, "state": "Seems Down", "severity": 3}}, + "page": map[string]any{"limit": 10}, + }) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/42/events/99/transitions": + writeTestJSON(t, w, map[string]any{ + "data": []any{map[string]any{ + "id": 1, + "event_id": 99, + "severity_after": 3, + "state_after": "Seems Down", + "reason": "opened", + }}, + }) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String()) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISitesSimulateFailure(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + token: "token-123", + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesSimulateFailureOptions{ + mode: "http-500", + siteIDs: mustSiteIDs(t, "42"), + trigger: true, + pollInterval: time.Millisecond, + expectEventState: "Seems Down", + expectEventSeverity: severity, + requireTransition: true, + expectTransitionReason: "opened", + }) + if err != nil { + t.Fatalf("runAPISitesSimulateFailure() error = %v\nstdout=%s", err, stdout.String()) + } + var summary apiSimulateFailureSummary + if err := json.Unmarshal(stdout.Bytes(), &summary); err != nil { + t.Fatalf("unmarshal summary: %v\n%s", err, stdout.String()) + } + if summary.Mode != "http-500" || len(summary.Sites) != 1 { + t.Fatalf("summary = %#v", summary) + } + if summary.Sites[0].Action != "updated" { + t.Fatalf("action = %q, want updated", summary.Sites[0].Action) + } + if summary.Sites[0].TriggerStatus != "failed_http_500" { + 
t.Fatalf("trigger status = %q, want failed_http_500", summary.Sites[0].TriggerStatus) + } + if got := summary.Sites[0].EventIDs; len(got) != 1 || got[0] != 99 { + t.Fatalf("event ids = %#v, want [99]", got) + } + if got := summary.Sites[0].EventStates; len(got) != 1 || got[0] != "Seems Down" { + t.Fatalf("event states = %#v, want [Seems Down]", got) + } + if got := summary.Sites[0].EventSeverities; len(got) != 1 || got[0] != 3 { + t.Fatalf("event severities = %#v, want [3]", got) + } + if summary.Sites[0].TransitionCount != 1 { + t.Fatalf("transition count = %d, want 1", summary.Sites[0].TransitionCount) + } + if len(summary.Sites[0].Transitions) != 1 || summary.Sites[0].Transitions[0].EventID != 99 { + t.Fatalf("transitions = %#v, want event 99", summary.Sites[0].Transitions) + } + wantCalls := []string{ + "PATCH /api/v1/sites/42", + "POST /api/v1/sites/42/trigger-now", + "GET /api/v1/sites/42/events?active=true&limit=10", + "GET /api/v1/sites/42/events/99/transitions", + } + if strings.Join(calls, "\n") != strings.Join(wantCalls, "\n") { + t.Fatalf("calls:\n%s\nwant:\n%s", strings.Join(calls, "\n"), strings.Join(wantCalls, "\n")) + } +} + +func TestRunAPISitesSimulateFailurePollsUntilAssertionsMatch(t *testing.T) { + var eventPolls int + var transitionPolls int + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/sites/42": + writeTestJSON(t, w, map[string]any{"id": 42}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/42/events": + eventPolls++ + state := "Seems Down" + severity := 3 + if eventPolls > 1 { + state = "Down" + severity = 4 + } + writeTestJSON(t, w, map[string]any{ + "data": []any{map[string]any{"id": 99, "state": state, "severity": severity}}, + "page": map[string]any{"limit": 10}, + }) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/42/events/99/transitions": + transitionPolls++ + reason := "opened" + 
if transitionPolls > 1 { + reason = "verifier_confirmed" + } + writeTestJSON(t, w, map[string]any{ + "data": []any{map[string]any{"id": transitionPolls, "event_id": 99, "reason": reason}}, + }) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String()) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISitesSimulateFailure(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesSimulateFailureOptions{ + mode: "http-500", + siteIDs: mustSiteIDs(t, "42"), + trigger: false, + wait: 100 * time.Millisecond, + pollInterval: time.Millisecond, + expectEventState: "Down", + expectTransitionReason: "verifier_confirmed", + }) + if err != nil { + t.Fatalf("runAPISitesSimulateFailure() error = %v\nstdout=%s", err, stdout.String()) + } + if eventPolls < 2 || transitionPolls < 2 { + t.Fatalf("eventPolls=%d transitionPolls=%d, want at least 2 each", eventPolls, transitionPolls) + } +} + +func TestRunAPISitesSimulateFailureFailsWhenAssertionsDoNotMatch(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/sites/42": + writeTestJSON(t, w, map[string]any{"id": 42}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/42/events": + writeTestJSON(t, w, map[string]any{"data": []any{}, "page": map[string]any{"limit": 10}}) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String()) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISitesSimulateFailure(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesSimulateFailureOptions{ + mode: "http-500", + siteIDs: mustSiteIDs(t, "42"), + trigger: false, + pollInterval: time.Millisecond, + expectEventState: "Seems Down", + requireTransition: true, 
+ }) + if err == nil { + t.Fatalf("runAPISitesSimulateFailure() error = nil\nstdout=%s", stdout.String()) + } + if !strings.Contains(err.Error(), `expected active event state "Seems Down"`) { + t.Fatalf("error = %v, want event-state assertion failure", err) + } + if !strings.Contains(stdout.String(), "expected at least one transition") { + t.Fatalf("stdout = %s, want transition assertion failure", stdout.String()) + } +} + +func TestRunAPISitesSimulateFailureCanCreateMissing(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/sites/42": + writeTestStatusJSON(t, w, http.StatusNotFound, map[string]string{"code": "site_not_found"}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/sites": + var body map[string]any + decodeTestJSON(t, r, &body) + if body["blog_id"] != float64(42) { + t.Fatalf("blog_id = %#v, want 42", body["blog_id"]) + } + writeTestStatusJSON(t, w, http.StatusCreated, map[string]any{"id": 42}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/42/events": + writeTestJSON(t, w, map[string]any{"data": []any{}, "page": map[string]any{"limit": 10}}) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String()) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISitesSimulateFailure(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesSimulateFailureOptions{ + mode: "keyword", + siteIDs: mustSiteIDs(t, "42"), + createMissing: true, + trigger: false, + pollInterval: time.Millisecond, + }) + if err != nil { + t.Fatalf("runAPISitesSimulateFailure() error = %v\nstdout=%s", err, stdout.String()) + } + var summary apiSimulateFailureSummary + if err := json.Unmarshal(stdout.Bytes(), &summary); err != nil { + t.Fatalf("unmarshal summary: %v\n%s", err, stdout.String()) + } + if 
summary.Sites[0].Action != "created" { + t.Fatalf("action = %q, want created", summary.Sites[0].Action) + } +} + +func TestRunAPISitesSimulateFailureRejectsUnmatchedBatchMarker(t *testing.T) { + start := apiCLIBatchBlogIDStart("simulation-batch") + var calls []string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + calls = append(calls, r.Method+" "+r.URL.RequestURI()) + switch { + case r.Method == http.MethodGet && + r.URL.Path == "/api/v1/sites/"+strconvInt64(start) && + r.URL.Query().Get("include_cli_metadata") == "true": + writeTestJSON(t, w, map[string]any{"id": start, "cli_batch": "other-batch"}) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String()) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISitesSimulateFailure(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSitesSimulateFailureOptions{ + mode: "http-500", + batch: "simulation-batch", + count: 1, + trigger: false, + pollInterval: time.Millisecond, + }) + if err == nil { + t.Fatal("runAPISitesSimulateFailure() error = nil, want batch mismatch") + } + if !strings.Contains(err.Error(), `does not belong to CLI batch "simulation-batch"`) { + t.Fatalf("error = %v, want batch mismatch", err) + } + if strings.Join(calls, "\n") != "GET /api/v1/sites/"+strconvInt64(start)+"?include_cli_metadata=true" { + t.Fatalf("calls:\n%s\nwant only GET", strings.Join(calls, "\n")) + } +} + +func TestAPISimulationSiteIDsFromBatch(t *testing.T) { + ids, err := apiSimulationSiteIDs(apiSitesSimulateFailureOptions{batch: "batch-a", count: 3}) + if err != nil { + t.Fatalf("apiSimulationSiteIDs() error = %v", err) + } + start := apiCLIBatchBlogIDStart("batch-a") + want := []int64{start, start + 1, start + 2} + for i := range want { + if ids[i] != want[i] { + t.Fatalf("ids[%d] = %d, want %d", i, ids[i], want[i]) + } + } +} + +func 
TestAPIFailureModesCoverRoadmapTargets(t *testing.T) { + for _, mode := range []string{"unreachable", "http-500", "http-403", "redirect", "keyword", "timeout", "tls"} { + t.Run(mode, func(t *testing.T) { + def, err := apiFailureMode(mode, "") + if err != nil { + t.Fatalf("apiFailureMode(%q) error = %v", mode, err) + } + if def.MonitorURL == "" || def.RedirectPolicy == "" { + t.Fatalf("definition = %#v, want URL and redirect policy", def) + } + }) + } +} + +func TestAPIFailureModesPreferFixtureWhenConfigured(t *testing.T) { + tests := []struct { + mode string + url string + }{ + {mode: "http-500", url: "http://api-fixture:8091/status/500"}, + {mode: "http-403", url: "http://api-fixture:8091/status/403"}, + {mode: "redirect", url: "http://api-fixture:8091/redirect"}, + {mode: "keyword", url: "http://api-fixture:8091/keyword"}, + {mode: "timeout", url: "http://api-fixture:8091/slow?delay=5s"}, + {mode: "tls", url: "https://api-fixture:8443/tls"}, + } + for _, tt := range tests { + t.Run(tt.mode, func(t *testing.T) { + def, err := apiFailureMode(tt.mode, "http://api-fixture:8091") + if err != nil { + t.Fatalf("apiFailureMode(%q) error = %v", tt.mode, err) + } + if def.MonitorURL != tt.url { + t.Fatalf("MonitorURL = %q, want %q", def.MonitorURL, tt.url) + } + }) + } +} + +func TestAPISimulationFixtureURLAutoDetection(t *testing.T) { + fixture := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/health" { + t.Fatalf("probe path = %q, want /health", r.URL.Path) + } + w.WriteHeader(http.StatusOK) + })) + defer fixture.Close() + + got := apiSimulationFixtureURL(context.Background(), apiSitesSimulateFailureOptions{ + fixtureURL: apiFixtureAuto, + fixtureProbeURL: fixture.URL + "/health", + }) + if got != defaultAPIFixtureMonitorURL { + t.Fatalf("fixture URL = %q, want default Docker monitor URL", got) + } + + got = apiSimulationFixtureURL(context.Background(), apiSitesSimulateFailureOptions{ + fixtureURL: apiFixtureAuto, 
+		fixtureProbeURL: "http://127.0.0.1:1/health",
+	})
+	if got != "" {
+		t.Fatalf("fixture URL = %q, want fallback to public endpoints", got)
+	}
+}
+
+// mustSiteIDs parses a comma-separated id list into the flag type, failing the
+// test on any parse error.
+func mustSiteIDs(t *testing.T, raw string) apiInt64SliceFlags {
+	t.Helper()
+	var ids apiInt64SliceFlags
+	if err := ids.Set(raw); err != nil {
+		t.Fatalf("set site ids: %v", err)
+	}
+	return ids
+}
diff --git a/cmd/jetmon2/api_cli_sites_test.go b/cmd/jetmon2/api_cli_sites_test.go
new file mode 100644
index 00000000..530d9c2f
--- /dev/null
+++ b/cmd/jetmon2/api_cli_sites_test.go
@@ -0,0 +1,181 @@
+package main
+
+import (
+	"encoding/json"
+	"net/url"
+	"testing"
+)
+
+func TestAPISitesListPath(t *testing.T) {
+	got, err := apiSitesListPath(apiSitesListFilters{
+		cursor:        "cur-1",
+		limit:         25,
+		stateIn:       "Down,Seems Down",
+		severityGTE:   3,
+		monitorActive: "1",
+		q:             "example.com",
+	})
+	if err != nil {
+		t.Fatalf("apiSitesListPath() error = %v", err)
+	}
+	u, err := url.Parse(got)
+	if err != nil {
+		t.Fatalf("parse path: %v", err)
+	}
+	if u.Path != "/api/v1/sites" {
+		t.Fatalf("path = %q, want /api/v1/sites", u.Path)
+	}
+	q := u.Query()
+	for key, want := range map[string]string{
+		"cursor":         "cur-1",
+		"limit":          "25",
+		"state__in":      "Down,Seems Down",
+		"severity__gte":  "3",
+		"monitor_active": "true",
+		"q":              "example.com",
+	} {
+		if got := q.Get(key); got != want {
+			t.Fatalf("query %s = %q, want %q (raw query %q)", key, got, want, u.RawQuery)
+		}
+	}
+}
+
+func TestAPISitesListPathRejectsAmbiguousStateFilter(t *testing.T) {
+	_, err := apiSitesListPath(apiSitesListFilters{state: "Down", stateIn: "Up,Down"})
+	if err == nil {
+		t.Fatal("apiSitesListPath() error = nil, want error")
+	}
+}
+
+func TestAPISiteResourcePath(t *testing.T) {
+	got, err := apiSiteResourcePath("42", "trigger-now")
+	if err != nil {
+		t.Fatalf("apiSiteResourcePath() error = %v", err)
+	}
+	if got != "/api/v1/sites/42/trigger-now" {
+		t.Fatalf("path = %q, want trigger-now path", got)
+	}
+	if _, err := apiSiteResourcePath("0", ""); err == nil {
+		
t.Fatal("apiSiteResourcePath() error = nil, want invalid id error") + } +} + +func TestMarshalAPISiteCreateBody(t *testing.T) { + var active apiOptionalBoolFlag + setTestFlag(t, &active, "false") + var bucket apiOptionalIntFlag + setTestFlag(t, &bucket, "7") + var redirect apiOptionalStringFlag + setTestFlag(t, &redirect, "alert") + var headers apiStringMapFlags + setTestFlag(t, &headers, "X-Jetmon-Test: yes") + var forbiddenKeywords apiStringSliceFlags + setTestFlag(t, &forbiddenKeywords, "metrics.evil-cdn.example/collect.js") + setTestFlag(t, &forbiddenKeywords, "buy cheap viagra") + + body, err := marshalAPISiteCreateBody(apiSiteCreateOptions{ + blogID: 12345, + monitorURL: "https://example.com", + monitorActive: active, + bucketNo: bucket, + forbiddenKeywords: forbiddenKeywords, + redirectPolicy: redirect, + customHeaders: headers, + }) + if err != nil { + t.Fatalf("marshalAPISiteCreateBody() error = %v", err) + } + var got map[string]any + if err := json.Unmarshal(body, &got); err != nil { + t.Fatalf("unmarshal body: %v", err) + } + if got["blog_id"] != float64(12345) { + t.Fatalf("blog_id = %#v, want 12345", got["blog_id"]) + } + if got["monitor_url"] != "https://example.com" { + t.Fatalf("monitor_url = %#v", got["monitor_url"]) + } + if got["monitor_active"] != false { + t.Fatalf("monitor_active = %#v, want false", got["monitor_active"]) + } + if got["bucket_no"] != float64(7) { + t.Fatalf("bucket_no = %#v, want 7", got["bucket_no"]) + } + if got["redirect_policy"] != "alert" { + t.Fatalf("redirect_policy = %#v, want alert", got["redirect_policy"]) + } + assertStringArray(t, got["forbidden_keywords"], []string{"metrics.evil-cdn.example/collect.js", "buy cheap viagra"}) + custom, ok := got["custom_headers"].(map[string]any) + if !ok { + t.Fatalf("custom_headers = %#v, want object", got["custom_headers"]) + } + if custom["X-Jetmon-Test"] != "yes" { + t.Fatalf("custom header = %#v, want yes", custom["X-Jetmon-Test"]) + } +} + +func 
TestMarshalAPISiteUpdateBodySupportsClears(t *testing.T) { + var keyword apiOptionalStringFlag + setTestFlag(t, &keyword, "") + var maintenanceEnd apiOptionalStringFlag + setTestFlag(t, &maintenanceEnd, "") + + body, err := marshalAPISiteUpdateBody(apiSiteUpdateOptions{ + checkKeyword: keyword, + clearCustomHeaders: true, + clearForbiddenKeywords: true, + maintenanceEnd: maintenanceEnd, + }) + if err != nil { + t.Fatalf("marshalAPISiteUpdateBody() error = %v", err) + } + var got map[string]any + if err := json.Unmarshal(body, &got); err != nil { + t.Fatalf("unmarshal body: %v", err) + } + if got["check_keyword"] != "" { + t.Fatalf("check_keyword = %#v, want empty string", got["check_keyword"]) + } + if got["maintenance_end"] != "" { + t.Fatalf("maintenance_end = %#v, want empty string", got["maintenance_end"]) + } + custom, ok := got["custom_headers"].(map[string]any) + if !ok { + t.Fatalf("custom_headers = %#v, want object", got["custom_headers"]) + } + if len(custom) != 0 { + t.Fatalf("custom_headers = %#v, want empty object", custom) + } + assertStringArray(t, got["forbidden_keywords"], []string{}) +} + +func TestMarshalAPISiteUpdateBodyRejectsCustomHeaderConflict(t *testing.T) { + var headers apiStringMapFlags + setTestFlag(t, &headers, "X-Test: yes") + _, err := marshalAPISiteUpdateBody(apiSiteUpdateOptions{ + customHeaders: headers, + clearCustomHeaders: true, + }) + if err == nil { + t.Fatal("marshalAPISiteUpdateBody() error = nil, want conflict error") + } +} + +func TestMarshalAPISiteUpdateBodyRejectsForbiddenKeywordConflict(t *testing.T) { + var forbiddenKeywords apiStringSliceFlags + setTestFlag(t, &forbiddenKeywords, "bad") + _, err := marshalAPISiteUpdateBody(apiSiteUpdateOptions{ + forbiddenKeywords: forbiddenKeywords, + clearForbiddenKeywords: true, + }) + if err == nil { + t.Fatal("marshalAPISiteUpdateBody() error = nil, want conflict error") + } +} + +func setTestFlag(t *testing.T, v interface{ Set(string) error }, raw string) { + t.Helper() + if 
err := v.Set(raw); err != nil { + t.Fatalf("Set(%q) error = %v", raw, err) + } +} diff --git a/cmd/jetmon2/api_cli_test.go b/cmd/jetmon2/api_cli_test.go new file mode 100644 index 00000000..6b1ae5e2 --- /dev/null +++ b/cmd/jetmon2/api_cli_test.go @@ -0,0 +1,496 @@ +package main + +import ( + "bytes" + "context" + "errors" + "flag" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestAPIRequestURL(t *testing.T) { + tests := []struct { + name string + baseURL string + target string + want string + wantErr bool + }{ + { + name: "absolute path", + baseURL: "http://localhost:8090", + target: "/api/v1/health", + want: "http://localhost:8090/api/v1/health", + }, + { + name: "relative path", + baseURL: "http://localhost:8090/", + target: "api/v1/me", + want: "http://localhost:8090/api/v1/me", + }, + { + name: "absolute url", + baseURL: "http://localhost:8090", + target: "http://127.0.0.1:9000/api/v1/health", + want: "http://127.0.0.1:9000/api/v1/health", + }, + { + name: "base requires host", + baseURL: "localhost:8090", + target: "/api/v1/health", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := apiRequestURL(tt.baseURL, tt.target) + if tt.wantErr { + if err == nil { + t.Fatal("apiRequestURL() error = nil, want error") + } + return + } + if err != nil { + t.Fatalf("apiRequestURL() error = %v", err) + } + if got != tt.want { + t.Fatalf("apiRequestURL() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestExecuteAPIRequestSendsAuthAndVerboseHeaders(t *testing.T) { + var sawAuth, sawIDKey bool + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := r.Header.Get("Authorization"); got == "Bearer token-123" { + sawAuth = true + } + if got := r.Header.Get("Idempotency-Key"); got == "idem-1" { + sawIDKey = true + } + w.Header().Set("X-Test-Response", "yes") + w.Header().Set("Set-Cookie", "session=secret-cookie") + 
w.Header().Set("X-Api-Key", "response-api-key") + w.WriteHeader(http.StatusCreated) + _, _ = w.Write([]byte(`{"ok":true}`)) + })) + defer srv.Close() + + var stdout, stderr bytes.Buffer + opts := apiCLIOptions{ + baseURL: srv.URL, + token: "token-123", + idempotencyKey: "idem-1", + verbose: true, + pretty: true, + timeout: time.Second, + out: &stdout, + errOut: &stderr, + } + if err := executeAPIRequest(context.Background(), srv.Client(), opts, http.MethodPost, "/api/v1/sites/42/trigger-now", []byte(`{}`)); err != nil { + t.Fatalf("executeAPIRequest() error = %v", err) + } + if !sawAuth { + t.Fatal("Authorization header was not sent") + } + if !sawIDKey { + t.Fatal("Idempotency-Key header was not sent") + } + if got := stdout.String(); !strings.Contains(got, "{\n \"ok\": true\n}") { + t.Fatalf("stdout = %q, want pretty JSON body", got) + } + errOut := stderr.String() + for _, want := range []string{ + "> POST /api/v1/sites/42/trigger-now HTTP/1.1", + "> Authorization: [redacted]", + "> Idempotency-Key: [redacted]", + "< HTTP/1.1 201 Created", + "< Set-Cookie: [redacted]", + "< X-Api-Key: [redacted]", + "< X-Test-Response: yes", + } { + if !strings.Contains(errOut, want) { + t.Fatalf("stderr missing %q:\n%s", want, errOut) + } + } + for _, secret := range []string{"token-123", "idem-1", "secret-cookie", "response-api-key"} { + if strings.Contains(errOut, secret) { + t.Fatalf("stderr leaked %q:\n%s", secret, errOut) + } + } +} + +func TestExecuteAPIRequestSkipsAutomaticAuthForDifferentOrigin(t *testing.T) { + base := httptest.NewServer(http.NotFoundHandler()) + defer base.Close() + + var sawAuth, sawIDKey bool + target := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawAuth = r.Header.Get("Authorization") != "" + sawIDKey = r.Header.Get("Idempotency-Key") != "" + _, _ = w.Write([]byte(`{"ok":true}`)) + })) + defer target.Close() + + var stdout bytes.Buffer + opts := apiCLIOptions{ + baseURL: base.URL, + token: "token-123", + 
idempotencyKey: "idem-1", + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + } + if err := executeAPIRequest(context.Background(), target.Client(), opts, http.MethodPost, target.URL+"/api/v1/sites", []byte(`{}`)); err != nil { + t.Fatalf("executeAPIRequest() error = %v", err) + } + if sawAuth { + t.Fatal("Authorization header was sent to a different origin") + } + if sawIDKey { + t.Fatal("Idempotency-Key header was sent to a different origin") + } +} + +func TestExecuteAPIRequestAnyOriginPolicySendsAutomaticAuth(t *testing.T) { + base := httptest.NewServer(http.NotFoundHandler()) + defer base.Close() + + var sawAuth, sawIDKey bool + target := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + sawAuth = r.Header.Get("Authorization") == "Bearer token-123" + sawIDKey = r.Header.Get("Idempotency-Key") == "idem-1" + _, _ = w.Write([]byte(`{"ok":true}`)) + })) + defer target.Close() + + var stdout bytes.Buffer + opts := apiCLIOptions{ + baseURL: base.URL, + token: "token-123", + authPolicy: "any-origin", + idempotencyKey: "idem-1", + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + } + if err := executeAPIRequest(context.Background(), target.Client(), opts, http.MethodPost, target.URL+"/api/v1/sites", []byte(`{}`)); err != nil { + t.Fatalf("executeAPIRequest() error = %v", err) + } + if !sawAuth { + t.Fatal("Authorization header was not sent with any-origin policy") + } + if !sawIDKey { + t.Fatal("Idempotency-Key header was not sent with any-origin policy") + } +} + +func TestExecuteAPIRequestRejectsInvalidAuthPolicy(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Fatal("server should not be called") + })) + defer srv.Close() + + err := executeAPIRequest(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + authPolicy: "sometimes", + timeout: time.Second, + out: ioDiscard{}, + errOut: ioDiscard{}, + }, http.MethodGet, 
"/api/v1/health", nil) + if err == nil { + t.Fatal("executeAPIRequest() error = nil, want invalid auth policy") + } + if !strings.Contains(err.Error(), "invalid auth policy") { + t.Fatalf("error = %v, want invalid auth policy", err) + } +} + +func TestExecuteAPIRequestReturnsErrorForHTTPFailureAfterWritingBody(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusUnauthorized) + _, _ = w.Write([]byte(`{"error":"missing token"}`)) + })) + defer srv.Close() + + var stdout bytes.Buffer + opts := apiCLIOptions{ + baseURL: srv.URL, + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + } + err := executeAPIRequest(context.Background(), srv.Client(), opts, http.MethodGet, "/api/v1/me", nil) + if err == nil { + t.Fatal("executeAPIRequest() error = nil, want error") + } + if got := stdout.String(); !strings.Contains(got, `"missing token"`) { + t.Fatalf("stdout = %q, want error body", got) + } +} + +func TestAPIFlagUsageUsesLongDashesAndHidesTokenDefault(t *testing.T) { + var stderr bytes.Buffer + opts := apiCLIOptions{ + baseURL: "http://localhost:8090", + token: "token-should-not-print", + timeout: 10 * time.Second, + errOut: &stderr, + } + fs := newAPIFlagSet("api health", &opts) + fs.Usage() + + got := stderr.String() + for _, want := range []string{ + "Usage of api health:", + "--allow-remote", + "--auth-policy string", + "--base-url string", + "--header value", + "--output string", + "--pretty", + "--timeout duration", + "--token string", + "-v", + "--verbose", + `API base URL (default "http://localhost:8090")`, + `request timeout (default 10s)`, + } { + if !strings.Contains(got, want) { + t.Fatalf("usage missing %q:\n%s", want, got) + } + } + for _, unwanted := range []string{ + " -base-url", + " -allow-remote", + " -header", + " -output", + " -pretty", + " -timeout", + " -token", + " -verbose", + "token-should-not-print", + } { + if strings.Contains(got, unwanted) { + 
t.Fatalf("usage contains %q:\n%s", unwanted, got) + } + } +} + +func TestAPIHelpReturnsFlagErrHelp(t *testing.T) { + var stderr bytes.Buffer + opts := apiCLIOptions{baseURL: "http://localhost:8090", timeout: 10 * time.Second, errOut: &stderr} + fs := newAPIFlagSet("api health", &opts) + err := parseAPIFlags(fs, []string{"--help"}) + if !errors.Is(err, flag.ErrHelp) { + t.Fatalf("Parse(--help) error = %v, want flag.ErrHelp", err) + } + if got := stderr.String(); !strings.Contains(got, "--base-url string") { + t.Fatalf("usage = %q, want long-dash flag output", got) + } +} + +func TestParseAPIFlagsAllowsFlagsAfterPositionals(t *testing.T) { + var stderr bytes.Buffer + opts := apiCLIOptions{baseURL: "http://localhost:8090", timeout: 10 * time.Second, errOut: &stderr} + fs := newAPIFlagSet("api sites get", &opts) + + err := parseAPIFlags(fs, []string{"12345", "--pretty", "--output", "table", "--header", "X-Test: yes"}) + if err != nil { + t.Fatalf("parseAPIFlags() error = %v", err) + } + if !opts.pretty { + t.Fatal("pretty = false, want true") + } + if opts.output != "table" { + t.Fatalf("output = %q, want table", opts.output) + } + if got := opts.headers; len(got) != 1 || got[0] != "X-Test: yes" { + t.Fatalf("headers = %#v, want X-Test header", got) + } + if got := fs.Args(); len(got) != 1 || got[0] != "12345" { + t.Fatalf("args = %#v, want [12345]", got) + } +} + +func TestParseAPIFlagsPreservesPositionalsAfterDoubleDash(t *testing.T) { + var stderr bytes.Buffer + opts := apiCLIOptions{baseURL: "http://localhost:8090", timeout: 10 * time.Second, errOut: &stderr} + fs := newAPIFlagSet("api request", &opts) + + err := parseAPIFlags(fs, []string{"GET", "--", "--not-a-flag"}) + if err != nil { + t.Fatalf("parseAPIFlags() error = %v", err) + } + if got := fs.Args(); len(got) != 2 || got[0] != "GET" || got[1] != "--not-a-flag" { + t.Fatalf("args = %#v, want GET and literal --not-a-flag", got) + } +} + +func TestNewAPIFlagSetHonorsPresetOutputDefault(t *testing.T) { + var 
stderr bytes.Buffer + opts := apiCLIOptions{output: "table", errOut: &stderr} + fs := newAPIFlagSet("api commands", &opts) + if err := parseAPIFlags(fs, nil); err != nil { + t.Fatalf("parseAPIFlags() error = %v", err) + } + if opts.output != "table" { + t.Fatalf("output = %q, want table", opts.output) + } +} + +func TestWriteAPICommandsTable(t *testing.T) { + var out bytes.Buffer + err := writeAPICommands(apiCLIOptions{output: "table", out: &out}) + if err != nil { + t.Fatalf("writeAPICommands() error = %v", err) + } + got := out.String() + for _, want := range []string{ + "command description", + "sites simulate-failure", + "mutate test sites into known failure modes", + "commands", + "list API CLI commands and examples", + } { + if !strings.Contains(got, want) { + t.Fatalf("commands table missing %q:\n%s", want, got) + } + } +} + +func TestWriteAPIResponseTableForSiteList(t *testing.T) { + body := []byte(`{ + "data": [ + {"id": 42, "monitor_url": "https://example.com", "monitor_active": true, "current_state": "Up", "current_severity": 0}, + {"id": 43, "monitor_url": "https://wordpress.com", "monitor_active": false, "current_state": "Paused", "current_severity": 0} + ], + "page": {"limit": 50} + }`) + var out bytes.Buffer + if err := writeAPIResponseTable(&out, body); err != nil { + t.Fatalf("writeAPIResponseTable() error = %v", err) + } + got := out.String() + for _, want := range []string{ + "id monitor_url monitor_active current_state current_severity", + "42 https://example.com true Up 0", + "43 https://wordpress.com false Paused 0", + } { + if !strings.Contains(got, want) { + t.Fatalf("table missing %q:\n%s", want, got) + } + } +} + +func TestWriteAPIResponseTableUsesNestedWorkflowRows(t *testing.T) { + body := []byte(`{ + "mode": "http-500", + "sites": [ + {"site_id": 42, "action": "updated", "note": "no active events returned"}, + {"site_id": 43, "action": "created", "error": "trigger failed"} + ] + }`) + var out bytes.Buffer + if err := 
writeAPIResponseTable(&out, body); err != nil { + t.Fatalf("writeAPIResponseTable() error = %v", err) + } + got := out.String() + for _, want := range []string{ + "site_id action note error", + "42 updated no active events returned", + "43 created trigger failed", + } { + if !strings.Contains(got, want) { + t.Fatalf("table missing %q:\n%s", want, got) + } + } +} + +func TestWriteAPIResponseTableIncludesSimulationSummaryColumns(t *testing.T) { + body := []byte(`{ + "mode": "http-500", + "sites": [ + { + "site_id": 42, + "action": "updated", + "trigger_status": "failed_http_500", + "event_ids": [99], + "event_states": ["Seems Down"], + "event_severities": [3], + "transition_count": 1 + } + ] + }`) + var out bytes.Buffer + if err := writeAPIResponseTable(&out, body); err != nil { + t.Fatalf("writeAPIResponseTable() error = %v", err) + } + got := out.String() + for _, want := range []string{ + "site_id action trigger_status event_ids event_states event_severities transition_count", + "42 updated failed_http_500 99 Seems Down 3 1", + } { + if !strings.Contains(got, want) { + t.Fatalf("table missing %q:\n%s", want, got) + } + } +} + +func TestWriteAPIResponseTableIncludesSmokeCleanupRows(t *testing.T) { + body := []byte(`{ + "steps": [ + {"name": "health", "status": "ok"}, + {"name": "me", "status": "ok"} + ], + "cleanup_results": [ + {"resource": "alert_contact", "id": 77, "status": "deleted"}, + {"resource": "site", "id": 910, "status": "failed", "error": "not found"} + ] + }`) + var out bytes.Buffer + if err := writeAPIResponseTable(&out, body); err != nil { + t.Fatalf("writeAPIResponseTable() error = %v", err) + } + got := out.String() + for _, want := range []string{ + "kind name id status detail", + "step health ok", + "cleanup alert_contact 77 deleted", + "cleanup site 910 failed not found", + } { + if !strings.Contains(got, want) { + t.Fatalf("table missing %q:\n%s", want, got) + } + } +} + +func TestWriteAPIResponseTableFallsBackToSortedColumns(t *testing.T) { + 
body := []byte(`{"zeta":"last","alpha":"first"}`) + var out bytes.Buffer + if err := writeAPIResponseTable(&out, body); err != nil { + t.Fatalf("writeAPIResponseTable() error = %v", err) + } + if got := out.String(); !strings.HasPrefix(got, "alpha zeta\n") { + t.Fatalf("table = %q, want sorted fallback columns", got) + } +} + +func TestWriteAPIOutputRejectsUnknownFormat(t *testing.T) { + err := writeAPIOutput(ioDiscard{}, []byte(`{"ok":true}`), apiCLIOptions{output: "yaml"}) + if err == nil { + t.Fatal("writeAPIOutput() error = nil, want bad output format") + } +} + +type ioDiscard struct{} + +func (ioDiscard) Write(p []byte) (int, error) { + return len(p), nil +} diff --git a/cmd/jetmon2/api_cli_webhooks.go b/cmd/jetmon2/api_cli_webhooks.go new file mode 100644 index 00000000..0295c9cb --- /dev/null +++ b/cmd/jetmon2/api_cli_webhooks.go @@ -0,0 +1,412 @@ +package main + +import ( + "context" + "encoding/json" + "errors" + "fmt" + "net/http" + "net/url" + "strconv" + "strings" +) + +type apiWebhookCreateOptions struct { + url string + active apiOptionalBoolFlag + events apiStringSliceFlags + siteIDs apiInt64SliceFlags + states apiStringSliceFlags +} + +type apiWebhookUpdateOptions struct { + url apiOptionalStringFlag + active apiOptionalBoolFlag + events apiStringSliceFlags + clearEvents bool + siteIDs apiInt64SliceFlags + clearSites bool + states apiStringSliceFlags + clearStates bool +} + +type apiWebhookDeliveriesFilters struct { + cursor string + limit int + status string +} + +type apiWebhookSiteFilter struct { + SiteIDs []int64 `json:"site_ids,omitempty"` +} + +type apiWebhookStateFilter struct { + States []string `json:"states,omitempty"` +} + +type apiWebhookCreateRequest struct { + URL string `json:"url"` + Active *bool `json:"active,omitempty"` + Events []string `json:"events"` + SiteFilter apiWebhookSiteFilter `json:"site_filter"` + StateFilter apiWebhookStateFilter `json:"state_filter"` +} + +type apiWebhookUpdateRequest struct { + URL *string 
`json:"url,omitempty"` + Active *bool `json:"active,omitempty"` + Events *[]string `json:"events,omitempty"` + SiteFilter *apiWebhookSiteFilter `json:"site_filter,omitempty"` + StateFilter *apiWebhookStateFilter `json:"state_filter,omitempty"` +} + +func cmdAPIWebhooks(args []string) error { + if len(args) == 0 { + return errors.New("usage: jetmon2 api webhooks [flags]") + } + + sub := args[0] + rest := args[1:] + switch sub { + case "list": + return cmdAPIWebhooksList(rest) + case "get": + return cmdAPIWebhooksGet(rest) + case "create": + return cmdAPIWebhooksCreate(rest) + case "update": + return cmdAPIWebhooksUpdate(rest) + case "delete": + return cmdAPIWebhooksDelete(rest) + case "rotate-secret": + return cmdAPIWebhooksRotateSecret(rest) + case "deliveries": + return cmdAPIWebhooksDeliveries(rest) + case "retry": + return cmdAPIWebhooksRetry(rest) + default: + return fmt.Errorf("unknown api webhooks subcommand %q (want: list, get, create, update, delete, rotate-secret, deliveries, retry)", sub) + } +} + +func cmdAPIWebhooksList(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api webhooks list", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api webhooks list [flags]") + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, "/api/v1/webhooks", nil) +} + +func cmdAPIWebhooksGet(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api webhooks get", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api webhooks get [flags] ") + } + target, err := apiWebhookPath(fs.Arg(0), "") + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPIWebhooksCreate(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api webhooks create", 
&opts) + addAPIIdempotencyFlag(fs, &opts) + create := apiWebhookCreateOptions{} + fs.StringVar(&create.url, "url", "", "webhook destination URL") + fs.Var(&create.active, "active", "webhook enabled: true or false") + fs.Var(&create.events, "event", "event type filter (repeatable or comma-separated)") + fs.Var(&create.siteIDs, "site-id", "site id filter (repeatable or comma-separated)") + fs.Var(&create.states, "state", "state filter (repeatable or comma-separated)") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 0 { + return errors.New("usage: jetmon2 api webhooks create [flags]") + } + body, err := marshalAPIWebhookCreateBody(create) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, "/api/v1/webhooks", body) +} + +func cmdAPIWebhooksUpdate(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api webhooks update", &opts) + update := apiWebhookUpdateOptions{} + fs.Var(&update.url, "url", "webhook destination URL") + fs.Var(&update.active, "active", "webhook enabled: true or false") + fs.Var(&update.events, "event", "event type filter (repeatable or comma-separated)") + fs.BoolVar(&update.clearEvents, "clear-events", false, "clear event filters") + fs.Var(&update.siteIDs, "site-id", "site id filter (repeatable or comma-separated)") + fs.BoolVar(&update.clearSites, "clear-sites", false, "clear site filters") + fs.Var(&update.states, "state", "state filter (repeatable or comma-separated)") + fs.BoolVar(&update.clearStates, "clear-states", false, "clear state filters") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api webhooks update [flags] ") + } + target, err := apiWebhookPath(fs.Arg(0), "") + if err != nil { + return err + } + body, err := marshalAPIWebhookUpdateBody(update) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, 
http.MethodPatch, target, body) +} + +func cmdAPIWebhooksDelete(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api webhooks delete", &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api webhooks delete [flags] ") + } + target, err := apiWebhookPath(fs.Arg(0), "") + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodDelete, target, nil) +} + +func cmdAPIWebhooksRotateSecret(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api webhooks rotate-secret", &opts) + addAPIIdempotencyFlag(fs, &opts) + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api webhooks rotate-secret [flags] ") + } + target, err := apiWebhookPath(fs.Arg(0), "rotate-secret") + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, target, nil) +} + +func cmdAPIWebhooksDeliveries(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api webhooks deliveries", &opts) + filters := apiWebhookDeliveriesFilters{} + fs.StringVar(&filters.cursor, "cursor", "", "pagination cursor") + fs.IntVar(&filters.limit, "limit", 0, "page size (1-200)") + fs.StringVar(&filters.status, "status", "", "delivery status: pending, delivered, failed, or abandoned") + if err := parseAPIFlags(fs, args); err != nil { + return err + } + if fs.NArg() != 1 { + return errors.New("usage: jetmon2 api webhooks deliveries [flags] ") + } + target, err := apiWebhookDeliveriesPath(fs.Arg(0), filters) + if err != nil { + return err + } + return executeAPIRequest(context.Background(), nil, opts, http.MethodGet, target, nil) +} + +func cmdAPIWebhooksRetry(args []string) error { + opts := defaultAPIOptions() + fs := newAPIFlagSet("api webhooks retry", &opts) + addAPIIdempotencyFlag(fs, &opts) + if err := 
parseAPIFlags(fs, args); err != nil {
+		return err
+	}
+	if fs.NArg() != 2 {
+		return errors.New("usage: jetmon2 api webhooks retry [flags] <webhook-id> <delivery-id>")
+	}
+	target, err := apiWebhookRetryPath(fs.Arg(0), fs.Arg(1))
+	if err != nil {
+		return err
+	}
+	return executeAPIRequest(context.Background(), nil, opts, http.MethodPost, target, nil)
+}
+
+// apiWebhookPath builds /api/v1/webhooks/<id>[/<suffix>] after validating the id.
+func apiWebhookPath(rawID, suffix string) (string, error) {
+	id, err := apiPositiveID(rawID, "webhook")
+	if err != nil {
+		return "", err
+	}
+	path := "/api/v1/webhooks/" + strconv.FormatInt(id, 10)
+	if suffix != "" {
+		path += "/" + strings.TrimPrefix(suffix, "/")
+	}
+	return path, nil
+}
+
+// apiWebhookDeliveriesPath builds the deliveries listing path with optional
+// cursor/limit/status query parameters. A zero limit means "not provided".
+func apiWebhookDeliveriesPath(rawID string, filters apiWebhookDeliveriesFilters) (string, error) {
+	path, err := apiWebhookPath(rawID, "deliveries")
+	if err != nil {
+		return "", err
+	}
+	if filters.limit < 0 {
+		// Zero is the unset default, so only negative values are invalid.
+		return "", errors.New("limit must not be negative")
+	}
+	values := url.Values{}
+	if filters.cursor != "" {
+		values.Set("cursor", filters.cursor)
+	}
+	if filters.limit > 0 {
+		values.Set("limit", strconv.Itoa(filters.limit))
+	}
+	if filters.status != "" {
+		switch filters.status {
+		case "pending", "delivered", "failed", "abandoned":
+			values.Set("status", filters.status)
+		default:
+			return "", errors.New("status must be one of: pending, delivered, failed, abandoned")
+		}
+	}
+	if len(values) == 0 {
+		return path, nil
+	}
+	return path + "?" 
+ values.Encode(), nil +} + +func apiWebhookRetryPath(rawWebhookID, rawDeliveryID string) (string, error) { + webhookID, err := apiPositiveID(rawWebhookID, "webhook") + if err != nil { + return "", err + } + deliveryID, err := apiPositiveID(rawDeliveryID, "delivery") + if err != nil { + return "", err + } + return fmt.Sprintf("/api/v1/webhooks/%d/deliveries/%d/retry", webhookID, deliveryID), nil +} + +func marshalAPIWebhookCreateBody(opts apiWebhookCreateOptions) ([]byte, error) { + if strings.TrimSpace(opts.url) == "" { + return nil, errors.New("url is required") + } + req := apiWebhookCreateRequest{ + URL: opts.url, + Active: opts.active.ptr(), + Events: opts.events.valuesOrEmpty(), + SiteFilter: apiWebhookSiteFilter{SiteIDs: opts.siteIDs.valuesOrEmpty()}, + StateFilter: apiWebhookStateFilter{States: opts.states.valuesOrEmpty()}, + } + return json.Marshal(req) +} + +func marshalAPIWebhookUpdateBody(opts apiWebhookUpdateOptions) ([]byte, error) { + if opts.clearEvents && opts.events.set { + return nil, errors.New("use --event or --clear-events, not both") + } + if opts.clearSites && opts.siteIDs.set { + return nil, errors.New("use --site-id or --clear-sites, not both") + } + if opts.clearStates && opts.states.set { + return nil, errors.New("use --state or --clear-states, not both") + } + + req := apiWebhookUpdateRequest{ + URL: opts.url.ptr(), + Active: opts.active.ptr(), + } + if opts.events.set || opts.clearEvents { + events := opts.events.valuesOrEmpty() + req.Events = &events + } + if opts.siteIDs.set || opts.clearSites { + req.SiteFilter = &apiWebhookSiteFilter{SiteIDs: opts.siteIDs.valuesOrEmpty()} + } + if opts.states.set || opts.clearStates { + req.StateFilter = &apiWebhookStateFilter{States: opts.states.valuesOrEmpty()} + } + return json.Marshal(req) +} + +type apiStringSliceFlags struct { + values []string + set bool +} + +func (f *apiStringSliceFlags) Set(v string) error { + for _, part := range strings.Split(v, ",") { + part = strings.TrimSpace(part) + 
if part == "" { + continue + } + f.values = append(f.values, part) + f.set = true + } + return nil +} + +func (f *apiStringSliceFlags) String() string { + return strings.Join(f.values, ",") +} + +func (f apiStringSliceFlags) valuesOrEmpty() []string { + if !f.set { + return []string{} + } + out := make([]string, len(f.values)) + copy(out, f.values) + return out +} + +func (f apiStringSliceFlags) ptr() *[]string { + if !f.set { + return nil + } + out := f.valuesOrEmpty() + return &out +} + +type apiInt64SliceFlags struct { + values []int64 + set bool +} + +func (f *apiInt64SliceFlags) Set(v string) error { + for _, part := range strings.Split(v, ",") { + part = strings.TrimSpace(part) + if part == "" { + continue + } + id, err := apiPositiveID(part, "site") + if err != nil { + return err + } + f.values = append(f.values, id) + f.set = true + } + return nil +} + +func (f *apiInt64SliceFlags) String() string { + parts := make([]string, len(f.values)) + for i, v := range f.values { + parts[i] = strconv.FormatInt(v, 10) + } + return strings.Join(parts, ",") +} + +func (f apiInt64SliceFlags) valuesOrEmpty() []int64 { + if !f.set { + return []int64{} + } + out := make([]int64, len(f.values)) + copy(out, f.values) + return out +} diff --git a/cmd/jetmon2/api_cli_webhooks_test.go b/cmd/jetmon2/api_cli_webhooks_test.go new file mode 100644 index 00000000..85ce187c --- /dev/null +++ b/cmd/jetmon2/api_cli_webhooks_test.go @@ -0,0 +1,190 @@ +package main + +import ( + "encoding/json" + "net/url" + "testing" +) + +func TestMarshalAPIWebhookCreateBody(t *testing.T) { + var active apiOptionalBoolFlag + setTestFlag(t, &active, "false") + var events apiStringSliceFlags + setTestFlag(t, &events, "event.opened,event.closed") + var siteIDs apiInt64SliceFlags + setTestFlag(t, &siteIDs, "42,99") + var states apiStringSliceFlags + setTestFlag(t, &states, "Down") + + body, err := marshalAPIWebhookCreateBody(apiWebhookCreateOptions{ + url: "https://example.com/hook", + active: active, + 
// TestMarshalAPIWebhookCreateBody checks that fully-populated create flags
// produce the expected JSON: url/active at the top level, events as an
// array, and site/state filters nested under their filter objects.
func TestMarshalAPIWebhookCreateBody(t *testing.T) {
	var active apiOptionalBoolFlag
	setTestFlag(t, &active, "false")
	var events apiStringSliceFlags
	setTestFlag(t, &events, "event.opened,event.closed")
	var siteIDs apiInt64SliceFlags
	setTestFlag(t, &siteIDs, "42,99")
	var states apiStringSliceFlags
	setTestFlag(t, &states, "Down")

	body, err := marshalAPIWebhookCreateBody(apiWebhookCreateOptions{
		url:     "https://example.com/hook",
		active:  active,
		events:  events,
		siteIDs: siteIDs,
		states:  states,
	})
	if err != nil {
		t.Fatalf("marshalAPIWebhookCreateBody() error = %v", err)
	}
	var got map[string]any
	if err := json.Unmarshal(body, &got); err != nil {
		t.Fatalf("unmarshal body: %v", err)
	}
	if got["url"] != "https://example.com/hook" {
		t.Fatalf("url = %#v", got["url"])
	}
	if got["active"] != false {
		t.Fatalf("active = %#v, want false", got["active"])
	}
	assertStringArray(t, got["events"], []string{"event.opened", "event.closed"})
	siteFilter := got["site_filter"].(map[string]any)
	assertNumberArray(t, siteFilter["site_ids"], []int64{42, 99})
	stateFilter := got["state_filter"].(map[string]any)
	assertStringArray(t, stateFilter["states"], []string{"Down"})
}

// TestMarshalAPIWebhookCreateBodyDefaultsFiltersToMatchAll checks the
// minimal create body: events is an explicit empty array while the nested
// filter lists are absent entirely (presumably via omitempty on the filter
// structs — TODO confirm against the request type definitions).
func TestMarshalAPIWebhookCreateBodyDefaultsFiltersToMatchAll(t *testing.T) {
	body, err := marshalAPIWebhookCreateBody(apiWebhookCreateOptions{
		url: "https://example.com/hook",
	})
	if err != nil {
		t.Fatalf("marshalAPIWebhookCreateBody() error = %v", err)
	}
	var got map[string]any
	if err := json.Unmarshal(body, &got); err != nil {
		t.Fatalf("unmarshal body: %v", err)
	}
	assertStringArray(t, got["events"], []string{})
	if _, ok := got["site_filter"].(map[string]any)["site_ids"]; ok {
		t.Fatalf("site_ids present in empty site_filter: %#v", got["site_filter"])
	}
	if _, ok := got["state_filter"].(map[string]any)["states"]; ok {
		t.Fatalf("states present in empty state_filter: %#v", got["state_filter"])
	}
}

// TestMarshalAPIWebhookUpdateBodySupportsClears checks that the --clear-*
// flags emit explicit empty lists (events) or empty filter objects.
func TestMarshalAPIWebhookUpdateBodySupportsClears(t *testing.T) {
	body, err := marshalAPIWebhookUpdateBody(apiWebhookUpdateOptions{
		clearEvents: true,
		clearSites:  true,
		clearStates: true,
	})
	if err != nil {
		t.Fatalf("marshalAPIWebhookUpdateBody() error = %v", err)
	}
	var got map[string]any
	if err := json.Unmarshal(body, &got); err != nil {
		t.Fatalf("unmarshal body: %v", err)
	}
	assertStringArray(t, got["events"], []string{})
	if _, ok := got["site_filter"].(map[string]any)["site_ids"]; ok {
		t.Fatalf("site_ids present in cleared site_filter: %#v", got["site_filter"])
	}
	if _, ok := got["state_filter"].(map[string]any)["states"]; ok {
		t.Fatalf("states present in cleared state_filter: %#v", got["state_filter"])
	}
}

// TestMarshalAPIWebhookUpdateBodyRejectsClearConflicts checks that setting a
// list flag together with its matching --clear-* flag is an error, for each
// of the three lists.
func TestMarshalAPIWebhookUpdateBodyRejectsClearConflicts(t *testing.T) {
	var events apiStringSliceFlags
	setTestFlag(t, &events, "event.opened")
	if _, err := marshalAPIWebhookUpdateBody(apiWebhookUpdateOptions{events: events, clearEvents: true}); err == nil {
		t.Fatal("events conflict error = nil, want error")
	}

	var siteIDs apiInt64SliceFlags
	setTestFlag(t, &siteIDs, "42")
	if _, err := marshalAPIWebhookUpdateBody(apiWebhookUpdateOptions{siteIDs: siteIDs, clearSites: true}); err == nil {
		t.Fatal("sites conflict error = nil, want error")
	}

	var states apiStringSliceFlags
	setTestFlag(t, &states, "Down")
	if _, err := marshalAPIWebhookUpdateBody(apiWebhookUpdateOptions{states: states, clearStates: true}); err == nil {
		t.Fatal("states conflict error = nil, want error")
	}
}

// TestAPIWebhookPaths checks the action path and delivery-retry path builders.
func TestAPIWebhookPaths(t *testing.T) {
	got, err := apiWebhookPath("7", "rotate-secret")
	if err != nil {
		t.Fatalf("apiWebhookPath() error = %v", err)
	}
	if got != "/api/v1/webhooks/7/rotate-secret" {
		t.Fatalf("path = %q, want rotate-secret path", got)
	}

	got, err = apiWebhookRetryPath("7", "44")
	if err != nil {
		t.Fatalf("apiWebhookRetryPath() error = %v", err)
	}
	if got != "/api/v1/webhooks/7/deliveries/44/retry" {
		t.Fatalf("retry path = %q, want delivery retry path", got)
	}
}

// TestAPIWebhookDeliveriesPath checks that cursor/limit/status filters all
// land in the query string of the deliveries path.
func TestAPIWebhookDeliveriesPath(t *testing.T) {
	got, err := apiWebhookDeliveriesPath("7", apiWebhookDeliveriesFilters{
		cursor: "cur-4",
		limit:  25,
		status: "abandoned",
	})
	if err != nil {
		t.Fatalf("apiWebhookDeliveriesPath() error = %v", err)
	}
	u, err := url.Parse(got)
	if err != nil {
		t.Fatalf("parse path: %v", err)
	}
	if u.Path != "/api/v1/webhooks/7/deliveries" {
		t.Fatalf("path = %q, want deliveries path", u.Path)
	}
	for key, want := range map[string]string{
		"cursor": "cur-4",
		"limit":  "25",
		"status": "abandoned",
	} {
		if got := u.Query().Get(key); got != want {
			t.Fatalf("query %s = %q, want %q", key, got, want)
		}
	}
}

// TestAPIWebhookDeliveriesPathRejectsBadStatus checks that an unknown status
// filter value is rejected.
func TestAPIWebhookDeliveriesPathRejectsBadStatus(t *testing.T) {
	_, err := apiWebhookDeliveriesPath("7", apiWebhookDeliveriesFilters{status: "waiting"})
	if err == nil {
		t.Fatal("apiWebhookDeliveriesPath() error = nil, want bad status error")
	}
}

// assertStringArray fails the test unless got is a JSON array ([]any) whose
// elements equal want, in order.
func assertStringArray(t *testing.T, got any, want []string) {
	t.Helper()
	items, ok := got.([]any)
	if !ok {
		t.Fatalf("value = %#v, want JSON array", got)
	}
	if len(items) != len(want) {
		t.Fatalf("array len = %d, want %d: %#v", len(items), len(want), items)
	}
	for i, wantItem := range want {
		if items[i] != wantItem {
			t.Fatalf("array[%d] = %#v, want %q", i, items[i], wantItem)
		}
	}
}

// assertNumberArray is assertStringArray for numeric arrays; JSON numbers
// decode to float64, so each element is compared against float64(want[i]).
func assertNumberArray(t *testing.T, got any, want []int64) {
	t.Helper()
	items, ok := got.([]any)
	if !ok {
		t.Fatalf("value = %#v, want JSON array", got)
	}
	if len(items) != len(want) {
		t.Fatalf("array len = %d, want %d: %#v", len(items), len(want), items)
	}
	for i, wantItem := range want {
		if items[i] != float64(wantItem) {
			t.Fatalf("array[%d] = %#v, want %d", i, items[i], wantItem)
		}
	}
}
// Defaults and fixed identifiers for the `api smoke` workflow.
const (
	// apiCLIBatchHeader tags smoke-created sites so they are identifiable
	// as CLI test resources.
	apiCLIBatchHeader      = "X-Jetmon-CLI-Batch"
	apiSmokeDefaultURL     = "https://example.com/"
	apiSmokeDefaultKeyword = "Example Domain"
	// The reserved .invalid TLD guarantees test alerts never reach a real mailbox.
	apiSmokeAlertTestEmail  = "jetmon-api-cli@example.invalid"
	apiSmokeDefaultExercise = "alert-contact"
	apiSmokeWebhookEvent    = "event.opened"
	apiSmokeWebhookState    = "Seems Down"
	apiSmokeWebhookMode     = "http-500"

	// Docker-local webhook fixture endpoints; overridable via
	// JETMON_API_WEBHOOK_FIXTURE_* environment variables (see cmdAPISmoke).
	defaultAPIFixtureWebhookURL         = "http://api-fixture:8091/webhook"
	defaultAPIFixtureWebhookRequestsURL = "http://localhost:18091/webhook/requests"
)

// apiSmokeOptions holds the parsed `api smoke` flag values.
type apiSmokeOptions struct {
	batch                string        // stable label for generated test resources
	blogID               int64         // explicit blog_id; 0 derives a slot from batch
	url                  string        // site monitor URL to create
	cleanup              bool          // delete smoke-created resources before exit
	exercise             string        // alert-contact, webhook, or none
	idempotencyKeyPrefix string        // prefix for POST Idempotency-Key headers; empty disables
	webhookURL           string        // receiver URL registered for --exercise=webhook
	webhookRequestsURL   string        // local fixture endpoint polled for received deliveries
	webhookWait          time.Duration // max wait for webhook delivery
	webhookPollInterval  time.Duration // poll cadence for delivery checks
	fixtureURL           string        // Docker fixture monitor URL, auto, or off
	fixtureProbeURL      string        // probe URL used when fixtureURL is auto
	allowExternalWebhook bool          // permit non-local webhook receiver URLs
}

// apiSmokeSummary is the JSON report written at the end of a smoke run
// (also on step failure, with the failed step recorded in Steps).
type apiSmokeSummary struct {
	Batch             string                         `json:"batch"`
	BlogID            int64                          `json:"blog_id"`
	BaseURL           string                         `json:"base_url"`
	Cleanup           bool                           `json:"cleanup"`
	Steps             []apiSmokeStep                 `json:"steps"`
	Site              json.RawMessage                `json:"site,omitempty"`
	TriggerNow        json.RawMessage                `json:"trigger_now,omitempty"`
	Events            json.RawMessage                `json:"events,omitempty"`
	AlertContact      json.RawMessage                `json:"alert_contact,omitempty"`
	AlertTest         json.RawMessage                `json:"alert_test,omitempty"`
	Webhook           *apiSmokeWebhookSummary        `json:"webhook,omitempty"`
	WebhookDelivery   json.RawMessage                `json:"webhook_delivery,omitempty"`
	WebhookFixture    *apiSmokeWebhookFixtureSummary `json:"webhook_fixture,omitempty"`
	FailureSimulation *apiSimulatedSiteResult        `json:"failure_simulation,omitempty"`
	CleanupResults    []apiSmokeCleanupResult        `json:"cleanup_results,omitempty"`
}

// apiSmokeStep records one workflow stage outcome ("ok" or "failed").
type apiSmokeStep struct {
	Name   string `json:"name"`
	Status string `json:"status"`
	Detail string `json:"detail,omitempty"`
}

// apiSmokeCleanupResult records the deletion attempt for one created resource.
type apiSmokeCleanupResult struct {
	Resource string `json:"resource"`
	ID       int64  `json:"id"`
	Status   string `json:"status"`
	Error    string `json:"error,omitempty"`
}

// apiSmokeWebhookSummary is the redacted webhook view placed in the summary
// (URL secrets stripped; only a secret preview is kept).
type apiSmokeWebhookSummary struct {
	ID            int64    `json:"id"`
	URL           string   `json:"url"`
	Active        bool     `json:"active"`
	Events        []string `json:"events,omitempty"`
	SecretPreview string   `json:"secret_preview,omitempty"`
}
// apiSmokeWebhookFixtureSummary reports what the local webhook fixture saw:
// total requests received plus the first delivery that matched the smoke
// site with a verified signature.
type apiSmokeWebhookFixtureSummary struct {
	Requests          int    `json:"requests"`
	MatchedDeliveryID string `json:"matched_delivery_id,omitempty"`
	MatchedEvent      string `json:"matched_event,omitempty"`
	SignatureVerified bool   `json:"signature_verified"`
}

// apiSmokeFixtureResponse mirrors the fixture's /webhook/requests payload.
type apiSmokeFixtureResponse struct {
	Count    int                         `json:"count"`
	Requests []apiSmokeFixtureWebhookHit `json:"requests"`
}

// apiSmokeFixtureWebhookHit is one webhook request recorded by the fixture.
type apiSmokeFixtureWebhookHit struct {
	ID             int    `json:"id"`
	Event          string `json:"event,omitempty"`
	Delivery       string `json:"delivery,omitempty"`
	Signature      string `json:"signature,omitempty"`
	SignatureValid *bool  `json:"signature_valid,omitempty"`
	Body           string `json:"body"`
}

// apiWorkflowHTTPError describes a non-success API response, carrying the
// request method, target path, status line, and raw body for the message.
type apiWorkflowHTTPError struct {
	Method string
	Target string
	Status string
	Body   []byte
}

// Error renders "METHOD target returned STATUS[: body]". The body detail is
// whitespace-trimmed and capped at 300 bytes (with a "..." suffix) so large
// responses stay readable; an empty body omits the detail entirely.
func (e apiWorkflowHTTPError) Error() string {
	var msg strings.Builder
	fmt.Fprintf(&msg, "%s %s returned %s", e.Method, e.Target, e.Status)
	detail := strings.TrimSpace(string(e.Body))
	if len(detail) > 300 {
		detail = detail[:300] + "..."
	}
	if detail != "" {
		msg.WriteString(": ")
		msg.WriteString(detail)
	}
	return msg.String()
}
// cmdAPISmoke parses the `jetmon2 api smoke` flags and runs the smoke
// workflow. Fixture-related defaults may be overridden through the
// JETMON_API_* environment variables before flags are applied.
func cmdAPISmoke(args []string) error {
	opts := defaultAPIOptions()
	fs := newAPIFlagSet("api smoke", &opts)
	smoke := apiSmokeOptions{
		url:      apiSmokeDefaultURL,
		cleanup:  true,
		exercise: apiSmokeDefaultExercise,
		webhookURL: envOrDefault(
			"JETMON_API_WEBHOOK_FIXTURE_URL",
			defaultAPIFixtureWebhookURL,
		),
		webhookRequestsURL: envOrDefault(
			"JETMON_API_WEBHOOK_FIXTURE_REQUESTS_URL",
			defaultAPIFixtureWebhookRequestsURL,
		),
		webhookWait:         60 * time.Second,
		webhookPollInterval: 2 * time.Second,
		fixtureURL:          envOrDefault("JETMON_API_FIXTURE_URL", apiFixtureAuto),
		fixtureProbeURL: envOrDefault(
			"JETMON_API_FIXTURE_PROBE_URL",
			defaultAPIFixtureProbeURL,
		),
	}
	fs.StringVar(&smoke.batch, "batch", "", "stable batch label for generated test resources")
	fs.Int64Var(&smoke.blogID, "blog-id", 0, "specific blog_id to create; default derives from --batch")
	fs.StringVar(&smoke.url, "url", smoke.url, "site monitor URL to create")
	fs.BoolVar(&smoke.cleanup, "cleanup", smoke.cleanup, "delete smoke-created resources before exit")
	fs.StringVar(&smoke.exercise, "exercise", smoke.exercise, "extra path to exercise: alert-contact, webhook, or none")
	fs.StringVar(&smoke.idempotencyKeyPrefix, "idempotency-key-prefix", "", "prefix for smoke POST Idempotency-Key headers")
	fs.StringVar(&smoke.webhookURL, "webhook-url", smoke.webhookURL, "receiver URL to register when --exercise=webhook")
	fs.StringVar(&smoke.webhookRequestsURL, "webhook-requests-url", smoke.webhookRequestsURL, "local fixture requests URL to poll when --exercise=webhook")
	fs.DurationVar(&smoke.webhookWait, "webhook-wait", smoke.webhookWait, "maximum wait for webhook delivery when --exercise=webhook")
	fs.DurationVar(&smoke.webhookPollInterval, "webhook-poll-interval", smoke.webhookPollInterval, "poll interval for webhook delivery checks")
	fs.StringVar(&smoke.fixtureURL, "fixture-url", smoke.fixtureURL, "Docker fixture monitor URL, auto, or off when --exercise=webhook")
	fs.StringVar(&smoke.fixtureProbeURL, "fixture-probe-url", smoke.fixtureProbeURL, "URL used when --fixture-url=auto")
	fs.BoolVar(&smoke.allowExternalWebhook, "allow-external-webhook-url", false, "allow --exercise=webhook to register a receiver URL outside localhost, loopback, or api-fixture")
	if err := parseAPIFlags(fs, args); err != nil {
		return err
	}
	if fs.NArg() != 0 {
		return errors.New("usage: jetmon2 api smoke [flags]")
	}
	return runAPISmoke(context.Background(), nil, opts, smoke)
}

// runAPISmoke executes the end-to-end smoke workflow:
// health -> me -> create site -> trigger-now -> events, then the optional
// alert-contact or webhook exercise. Every step is recorded in an
// apiSmokeSummary written to opts.out as the final (or failure) output, and
// when cleanup is enabled all created resources are deleted — on success and
// on step failure alike.
//
// client is forwarded to doAPIRequest; cmdAPISmoke passes nil, so the
// underlying helper presumably supplies a default client — TODO confirm.
func runAPISmoke(ctx context.Context, client *http.Client, opts apiCLIOptions, smoke apiSmokeOptions) error {
	if opts.out == nil {
		opts.out = io.Discard
	}
	// Remote (non-local) API targets require an explicit --batch so test
	// resources remain identifiable.
	remote, err := requireAPILocalOrAllowRemote(opts, opts.allowRemote, "api smoke")
	if err != nil {
		return err
	}
	if remote && strings.TrimSpace(smoke.batch) == "" {
		return errors.New("api smoke requires --batch when --allow-remote targets a non-local API")
	}
	// Fill in defaults for any zero-valued option so runAPISmoke can also be
	// called directly (e.g. from tests) with a sparse apiSmokeOptions.
	if smoke.batch == "" {
		smoke.batch = apiCLINewBatchID("smoke")
	}
	if smoke.blogID == 0 {
		smoke.blogID = apiCLIBatchBlogIDStart(smoke.batch)
	}
	if smoke.url == "" {
		smoke.url = apiSmokeDefaultURL
	}
	if smoke.exercise == "" {
		smoke.exercise = apiSmokeDefaultExercise
	}
	if smoke.webhookURL == "" {
		smoke.webhookURL = defaultAPIFixtureWebhookURL
	}
	if smoke.webhookRequestsURL == "" {
		smoke.webhookRequestsURL = defaultAPIFixtureWebhookRequestsURL
	}
	if smoke.webhookWait == 0 {
		smoke.webhookWait = 60 * time.Second
	}
	if smoke.webhookPollInterval == 0 {
		smoke.webhookPollInterval = 2 * time.Second
	}
	if smoke.fixtureURL == "" {
		smoke.fixtureURL = apiFixtureAuto
	}
	if smoke.fixtureProbeURL == "" {
		smoke.fixtureProbeURL = defaultAPIFixtureProbeURL
	}
	if smoke.exercise != "alert-contact" && smoke.exercise != "webhook" && smoke.exercise != "none" {
		return errors.New("exercise must be one of: alert-contact, webhook, none")
	}
	// The webhook exercise is restricted to Docker-local setups with a
	// local/allow-listed receiver URL.
	if remote && smoke.exercise == "webhook" {
		return errors.New("api smoke --exercise webhook is Docker-local only and refuses non-local API targets")
	}
	if smoke.exercise == "webhook" {
		if err := requireAPIWebhookFixtureURLAllowed(smoke.webhookURL, smoke.allowExternalWebhook); err != nil {
			return err
		}
		if err := requireAPIWebhookFixtureRequestsLocal(smoke.webhookRequestsURL); err != nil {
			return err
		}
	}
	// These run after defaulting, so they only catch explicitly negative values.
	if smoke.webhookWait <= 0 {
		return errors.New("webhook-wait must be positive")
	}
	if smoke.webhookPollInterval <= 0 {
		return errors.New("webhook-poll-interval must be positive")
	}

	summary := apiSmokeSummary{
		Batch:   smoke.batch,
		BlogID:  smoke.blogID,
		BaseURL: opts.baseURL,
		Cleanup: smoke.cleanup,
	}
	// Closure state shared between steps and cleanup.
	var createdContactID int64
	var createdWebhookID int64
	siteCreated := false

	// cleanup best-effort-deletes created resources in reverse dependency
	// order (webhook, alert contact, site); failures are recorded in the
	// summary rather than returned.
	cleanup := func() {
		if !smoke.cleanup {
			return
		}
		if createdWebhookID > 0 {
			target := "/api/v1/webhooks/" + strconv.FormatInt(createdWebhookID, 10)
			err := apiWorkflowDelete(ctx, client, opts, target)
			result := apiSmokeCleanupResult{Resource: "webhook", ID: createdWebhookID, Status: "deleted"}
			if err != nil {
				result.Status = "failed"
				result.Error = err.Error()
			}
			summary.CleanupResults = append(summary.CleanupResults, result)
		}
		if createdContactID > 0 {
			target := "/api/v1/alert-contacts/" + strconv.FormatInt(createdContactID, 10)
			err := apiWorkflowDelete(ctx, client, opts, target)
			result := apiSmokeCleanupResult{Resource: "alert_contact", ID: createdContactID, Status: "deleted"}
			if err != nil {
				result.Status = "failed"
				result.Error = err.Error()
			}
			summary.CleanupResults = append(summary.CleanupResults, result)
		}
		if siteCreated {
			target := "/api/v1/sites/" + strconv.FormatInt(smoke.blogID, 10)
			err := apiWorkflowDelete(ctx, client, opts, target)
			result := apiSmokeCleanupResult{Resource: "site", ID: smoke.blogID, Status: "deleted"}
			if err != nil {
				result.Status = "failed"
				result.Error = err.Error()
			}
			summary.CleanupResults = append(summary.CleanupResults, result)
		}
	}

	// step runs one named stage; on failure it records the step, runs
	// cleanup, writes the summary (best-effort; write error ignored), and
	// wraps the underlying error.
	step := func(name string, fn func() error) error {
		if err := fn(); err != nil {
			summary.Steps = append(summary.Steps, apiSmokeStep{Name: name, Status: "failed", Detail: err.Error()})
			cleanup()
			_ = writeAPIValueOutput(opts.out, summary, opts)
			return fmt.Errorf("smoke %s failed: %w", name, err)
		}
		summary.Steps = append(summary.Steps, apiSmokeStep{Name: name, Status: "ok"})
		return nil
	}

	if err := step("health", func() error {
		_, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodGet, "/api/v1/health", nil, "")
		return err
	}); err != nil {
		return err
	}
	if err := step("me", func() error {
		_, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodGet, "/api/v1/me", nil, "")
		return err
	}); err != nil {
		return err
	}
	if err := step("create_site", func() error {
		keyword := apiSmokeDefaultKeyword
		redirectPolicy := "follow"
		checkInterval := 5
		// The batch header marks the site as a CLI smoke resource.
		headers := map[string]string{apiCLIBatchHeader: smoke.batch}
		site, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodPost, "/api/v1/sites", apiSiteCreateRequest{
			BlogID:         smoke.blogID,
			MonitorURL:     smoke.url,
			CheckKeyword:   &keyword,
			RedirectPolicy: &redirectPolicy,
			CheckInterval:  &checkInterval,
			CustomHeaders:  &headers,
		}, apiSmokeIDKey(smoke, "create-site"))
		if err != nil {
			return err
		}
		siteCreated = true
		summary.Site = site
		return nil
	}); err != nil {
		return err
	}
	if err := step("trigger_now", func() error {
		body, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodPost, fmt.Sprintf("/api/v1/sites/%d/trigger-now", smoke.blogID), nil, apiSmokeIDKey(smoke, "trigger-now"))
		if err != nil {
			return err
		}
		summary.TriggerNow = body
		return nil
	}); err != nil {
		return err
	}
	if err := step("events", func() error {
		body, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodGet, fmt.Sprintf("/api/v1/sites/%d/events?limit=5", smoke.blogID), nil, "")
		if err != nil {
			return err
		}
		summary.Events = body
		return nil
	}); err != nil {
		return err
	}
	if smoke.exercise == "alert-contact" {
		if err := step("create_alert_contact", func() error {
			contact, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodPost, "/api/v1/alert-contacts", apiAlertContactCreateRequest{
				Label:       "api-cli-smoke-" + smoke.batch,
				Transport:   "email",
				Destination: json.RawMessage(`{"address":"` + apiSmokeAlertTestEmail + `"}`),
				SiteFilter:  apiAlertContactSiteFilter{SiteIDs: []int64{smoke.blogID}},
			}, apiSmokeIDKey(smoke, "create-alert-contact"))
			if err != nil {
				return err
			}
			id, err := apiJSONInt64(contact, "id")
			if err != nil {
				return err
			}
			createdContactID = id
			summary.AlertContact = contact
			return nil
		}); err != nil {
			return err
		}
		if err := step("alert_contact_test", func() error {
			body, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodPost, fmt.Sprintf("/api/v1/alert-contacts/%d/test", createdContactID), nil, apiSmokeIDKey(smoke, "alert-contact-test"))
			if err != nil {
				return err
			}
			summary.AlertTest = body
			return nil
		}); err != nil {
			return err
		}
	}
	if smoke.exercise == "webhook" {
		var webhookSecret string
		// Reset the fixture so only this run's deliveries are observed.
		if err := step("webhook_clear_fixture", func() error {
			return clearAPIWebhookFixtureRequests(ctx, client, opts, smoke.webhookRequestsURL)
		}); err != nil {
			return err
		}
		// Create the webhook inactive first; it is activated only after the
		// fixture URL has been rewritten to carry the signing secret.
		if err := step("create_webhook", func() error {
			active := false
			hook, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodPost, "/api/v1/webhooks", apiWebhookCreateRequest{
				URL:    strings.TrimSpace(smoke.webhookURL),
				Active: &active,
				Events: []string{apiSmokeWebhookEvent},
				SiteFilter: apiWebhookSiteFilter{
					SiteIDs: []int64{smoke.blogID},
				},
				StateFilter: apiWebhookStateFilter{
					States: []string{apiSmokeWebhookState},
				},
			}, apiSmokeIDKey(smoke, "create-webhook"))
			if err != nil {
				return err
			}
			id, err := apiJSONInt64(hook, "id")
			if err != nil {
				return err
			}
			secret, err := apiJSONString(hook, "secret")
			if err != nil {
				return err
			}
			createdWebhookID = id
			webhookSecret = secret
			summary.Webhook = redactedAPIWebhookSummary(hook)
			return nil
		}); err != nil {
			return err
		}
		if err := step("activate_webhook_signature_fixture", func() error {
			signedURL, err := apiWebhookFixtureURLWithSecret(smoke.webhookURL, webhookSecret)
			if err != nil {
				return err
			}
			active := true
			body, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodPatch, fmt.Sprintf("/api/v1/webhooks/%d", createdWebhookID), apiWebhookUpdateRequest{
				URL:    &signedURL,
				Active: &active,
			}, "")
			if err != nil {
				// The PATCH body contains the secret; scrub it from errors.
				return redactAPISecretError(err, webhookSecret)
			}
			summary.Webhook = redactedAPIWebhookSummary(body)
			return nil
		}); err != nil {
			return err
		}
		if err := step("simulate_failure_for_webhook", func() error {
			result, err := runAPISmokeWebhookFailureSimulation(ctx, client, opts, smoke)
			if err != nil {
				return err
			}
			summary.FailureSimulation = &result
			return nil
		}); err != nil {
			return err
		}
		if err := step("webhook_fixture_delivery", func() error {
			fixture, err := waitForAPIWebhookFixtureDelivery(ctx, client, opts, smoke)
			if err != nil {
				return err
			}
			summary.WebhookFixture = fixture
			return nil
		}); err != nil {
			return err
		}
		if err := step("webhook_delivery_row", func() error {
			body, err := waitForAPIWebhookDeliveredRow(ctx, client, opts, createdWebhookID, smoke, summary.WebhookFixture.MatchedDeliveryID)
			if err != nil {
				return err
			}
			summary.WebhookDelivery = body
			return nil
		}); err != nil {
			return err
		}
	}

	cleanup()
	return writeAPIValueOutput(opts.out, summary, opts)
}

// apiWorkflowRequestJSON marshals body (when non-nil), performs the API
// request with the given idempotency key, and returns the trimmed response
// body as raw JSON (or the literal `null` for an empty body). Responses with
// status >= 400 return both the body and an apiWorkflowHTTPError.
func apiWorkflowRequestJSON(ctx context.Context, client *http.Client, opts apiCLIOptions, method, target string, body any, idempotencyKey string) (json.RawMessage, error) {
	var payload []byte
	var err error
	if body != nil {
		payload, err = json.Marshal(body)
		if err != nil {
			return nil, err
		}
	}
	requestOpts := opts
	requestOpts.idempotencyKey = idempotencyKey
	resp, err := doAPIRequest(ctx, client, requestOpts, method, target, payload)
	if err != nil {
		return nil, err
	}
	trimmed := json.RawMessage(strings.TrimSpace(string(resp.Body)))
	if len(trimmed) == 0 {
		trimmed = json.RawMessage(`null`)
	}
	if resp.StatusCode >= 400 {
		return trimmed, apiWorkflowHTTPError{Method: method, Target: target, Status: resp.Status, Body: resp.Body}
	}
	return trimmed, nil
}

// apiWorkflowDelete performs a DELETE against the API and converts any
// status >= 400 into an apiWorkflowHTTPError.
func apiWorkflowDelete(ctx context.Context, client *http.Client, opts apiCLIOptions, target string) error {
	resp, err := doAPIRequest(ctx, client, opts, http.MethodDelete, target, nil)
	if err != nil {
		return err
	}
	if resp.StatusCode >= 400 {
		return apiWorkflowHTTPError{Method: http.MethodDelete, Target: target, Status: resp.Status, Body: resp.Body}
	}
	return nil
}
// runAPISmokeWebhookFailureSimulation drives a single http-500 failure
// simulation for the smoke site so the (already-activated) webhook fires.
// It requires the Docker API fixture and expects the site to transition to
// the "Seems Down" state with an "opened" reason and severity 3.
func runAPISmokeWebhookFailureSimulation(ctx context.Context, client *http.Client, opts apiCLIOptions, smoke apiSmokeOptions) (apiSimulatedSiteResult, error) {
	sim := apiSitesSimulateFailureOptions{
		mode:                   apiSmokeWebhookMode,
		batch:                  smoke.batch,
		count:                  1,
		blogIDStart:            smoke.blogID,
		createMissing:          false,
		trigger:                true,
		wait:                   smoke.webhookWait,
		pollInterval:           smoke.webhookPollInterval,
		idempotencyKeyPrefix:   smoke.idempotencyKeyPrefix,
		fixtureURL:             smoke.fixtureURL,
		fixtureProbeURL:        smoke.fixtureProbeURL,
		expectEventState:       apiSmokeWebhookState,
		requireTransition:      true,
		expectTransitionReason: "opened",
	}
	sim.expectEventSeverity.set = true
	sim.expectEventSeverity.value = 3
	fixtureURL := apiSimulationFixtureURL(ctx, sim)
	if fixtureURL == "" {
		return apiSimulatedSiteResult{}, errors.New("Docker API fixture is required for --exercise=webhook; start api-fixture or pass --fixture-url")
	}
	def, err := apiFailureMode(sim.mode, fixtureURL)
	if err != nil {
		return apiSimulatedSiteResult{}, err
	}
	return runAPISiteSimulation(ctx, client, opts, sim, def, smoke.blogID, 0)
}

// clearAPIWebhookFixtureRequests DELETEs the fixture's recorded requests so
// a run only observes its own deliveries. The error body read is capped at
// 300 bytes to match apiWorkflowHTTPError's display limit.
func clearAPIWebhookFixtureRequests(ctx context.Context, client *http.Client, opts apiCLIOptions, requestsURL string) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodDelete, strings.TrimSpace(requestsURL), nil)
	if err != nil {
		return err
	}
	resp, err := apiExternalHTTPClient(client, opts).Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 400 {
		body, _ := io.ReadAll(io.LimitReader(resp.Body, 300))
		return apiWorkflowHTTPError{Method: http.MethodDelete, Target: requestsURL, Status: resp.Status, Body: body}
	}
	return nil
}

// waitForAPIWebhookFixtureDelivery polls the fixture until it has recorded a
// signature-verified delivery for the smoke site, or webhookWait elapses.
// Context cancellation aborts the poll between attempts.
func waitForAPIWebhookFixtureDelivery(ctx context.Context, client *http.Client, opts apiCLIOptions, smoke apiSmokeOptions) (*apiSmokeWebhookFixtureSummary, error) {
	deadline := time.Now().Add(smoke.webhookWait)
	for {
		fixture, err := getAPIWebhookFixtureRequests(ctx, client, opts, smoke.webhookRequestsURL)
		if err != nil {
			return nil, err
		}
		if summary := matchingAPIWebhookFixtureDelivery(fixture, smoke.blogID); summary != nil {
			return summary, nil
		}
		// Deadline is checked after each attempt, so at least one poll
		// always happens.
		if time.Now().After(deadline) {
			return nil, fmt.Errorf("timed out waiting for verified webhook fixture delivery for site %d", smoke.blogID)
		}
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-time.After(smoke.webhookPollInterval):
		}
	}
}

// getAPIWebhookFixtureRequests GETs and decodes the fixture's recorded
// webhook requests.
func getAPIWebhookFixtureRequests(ctx context.Context, client *http.Client, opts apiCLIOptions, requestsURL string) (apiSmokeFixtureResponse, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimSpace(requestsURL), nil)
	if err != nil {
		return apiSmokeFixtureResponse{}, err
	}
	resp, err := apiExternalHTTPClient(client, opts).Do(req)
	if err != nil {
		return apiSmokeFixtureResponse{}, err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return apiSmokeFixtureResponse{}, err
	}
	if resp.StatusCode >= 400 {
		return apiSmokeFixtureResponse{}, apiWorkflowHTTPError{Method: http.MethodGet, Target: requestsURL, Status: resp.Status, Body: body}
	}
	var fixture apiSmokeFixtureResponse
	if err := json.Unmarshal(body, &fixture); err != nil {
		return apiSmokeFixtureResponse{}, err
	}
	return fixture, nil
}

// matchingAPIWebhookFixtureDelivery returns a summary for the first recorded
// request that (a) has a verified signature, (b) carries the expected event
// type for the given site, and (c) has a non-empty delivery ID. Returns nil
// when nothing matches yet.
func matchingAPIWebhookFixtureDelivery(fixture apiSmokeFixtureResponse, siteID int64) *apiSmokeWebhookFixtureSummary {
	for _, req := range fixture.Requests {
		if req.SignatureValid == nil || !*req.SignatureValid {
			continue
		}
		var body struct {
			Type   string `json:"type"`
			SiteID int64  `json:"site_id"`
		}
		if err := json.Unmarshal([]byte(req.Body), &body); err != nil {
			continue
		}
		if body.Type != apiSmokeWebhookEvent || body.SiteID != siteID {
			continue
		}
		if strings.TrimSpace(req.Delivery) == "" {
			continue
		}
		return &apiSmokeWebhookFixtureSummary{
			Requests:          fixture.Count,
			MatchedDeliveryID: req.Delivery,
			MatchedEvent:      req.Event,
			SignatureVerified: true,
		}
	}
	return nil
}
// waitForAPIWebhookDeliveredRow polls the API's delivery log until it shows
// a delivered row for the smoke site (and, when non-empty, the specific
// delivery ID observed by the fixture), or webhookWait elapses.
func waitForAPIWebhookDeliveredRow(ctx context.Context, client *http.Client, opts apiCLIOptions, webhookID int64, smoke apiSmokeOptions, expectedDeliveryID string) (json.RawMessage, error) {
	deadline := time.Now().Add(smoke.webhookWait)
	target := fmt.Sprintf("/api/v1/webhooks/%d/deliveries?status=delivered&limit=10", webhookID)
	for {
		body, err := apiWorkflowRequestJSON(ctx, client, opts, http.MethodGet, target, nil, "")
		if err != nil {
			return nil, err
		}
		if apiDeliveredWebhookRowsIncludeSite(body, smoke.blogID, expectedDeliveryID) {
			return body, nil
		}
		if time.Now().After(deadline) {
			if strings.TrimSpace(expectedDeliveryID) != "" {
				return nil, fmt.Errorf("timed out waiting for delivered webhook row %s for webhook %d and site %d", expectedDeliveryID, webhookID, smoke.blogID)
			}
			return nil, fmt.Errorf("timed out waiting for delivered webhook row for webhook %d and site %d", webhookID, smoke.blogID)
		}
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-time.After(smoke.webhookPollInterval):
		}
	}
}

// apiDeliveredWebhookRowsIncludeSite reports whether the deliveries envelope
// contains a delivered row whose payload matches the smoke event type and
// site, and whose row ID matches the expected delivery ID (when given).
// Undecodable envelopes or payloads are treated as "no match".
func apiDeliveredWebhookRowsIncludeSite(body json.RawMessage, siteID int64, expectedDeliveryID string) bool {
	var envelope struct {
		Data []struct {
			ID      int64           `json:"id"`
			Status  string          `json:"status"`
			Payload json.RawMessage `json:"payload"`
		} `json:"data"`
	}
	if err := json.Unmarshal(body, &envelope); err != nil {
		return false
	}
	for _, row := range envelope.Data {
		if row.Status != "delivered" {
			continue
		}
		var payload struct {
			Type   string `json:"type"`
			SiteID int64  `json:"site_id"`
		}
		if err := json.Unmarshal(row.Payload, &payload); err != nil {
			continue
		}
		if payload.Type == apiSmokeWebhookEvent && payload.SiteID == siteID && apiDeliveryIDMatches(row.ID, expectedDeliveryID) {
			return true
		}
	}
	return false
}

// apiDeliveryIDMatches reports whether rowID equals the expected delivery
// ID. An empty expected ID matches any row; a non-numeric one matches none.
func apiDeliveryIDMatches(rowID int64, expectedDeliveryID string) bool {
	expectedDeliveryID = strings.TrimSpace(expectedDeliveryID)
	if expectedDeliveryID == "" {
		return true
	}
	expected, err := strconv.ParseInt(expectedDeliveryID, 10, 64)
	if err != nil {
		return false
	}
	return rowID == expected
}

// apiWebhookFixtureURLWithSecret returns rawURL with the signing secret set
// as the "secret" query parameter; the URL must be absolute and the secret
// non-blank.
func apiWebhookFixtureURLWithSecret(rawURL, secret string) (string, error) {
	if strings.TrimSpace(secret) == "" {
		return "", errors.New("webhook secret is empty")
	}
	u, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil {
		return "", err
	}
	if !u.IsAbs() || u.Host == "" {
		return "", errors.New("webhook-url must be absolute")
	}
	q := u.Query()
	q.Set("secret", secret)
	u.RawQuery = q.Encode()
	return u.String(), nil
}

// apiExternalHTTPClient returns the injected client when present (tests),
// otherwise a fresh client whose timeout mirrors the CLI timeout (defaulting
// to 10s). Used for fixture traffic, which bypasses the API helpers.
func apiExternalHTTPClient(client *http.Client, opts apiCLIOptions) *http.Client {
	if client != nil {
		return client
	}
	timeout := opts.timeout
	if timeout <= 0 {
		timeout = 10 * time.Second
	}
	return &http.Client{Timeout: timeout}
}

// requireAPIWebhookFixtureURLAllowed enforces that the webhook receiver URL
// is absolute and points at localhost/loopback or the "api-fixture" host,
// unless the caller explicitly allowed external receivers.
func requireAPIWebhookFixtureURLAllowed(rawURL string, allowExternal bool) error {
	u, err := url.Parse(strings.TrimSpace(rawURL))
	if err != nil {
		return fmt.Errorf("invalid webhook-url: %w", err)
	}
	if !u.IsAbs() || u.Host == "" {
		return errors.New("webhook-url must be absolute")
	}
	if allowExternal {
		return nil
	}
	// Normalize the host (lowercase, drop a trailing FQDN dot) before the
	// api-fixture comparison.
	host := strings.ToLower(strings.TrimSuffix(u.Hostname(), "."))
	if host == "api-fixture" {
		return nil
	}
	local, err := isLocalAPIURL(rawURL)
	if err != nil {
		return fmt.Errorf("invalid webhook-url: %w", err)
	}
	if local {
		return nil
	}
	return fmt.Errorf("webhook-url must be localhost, loopback, or api-fixture for api smoke --exercise webhook; pass --allow-external-webhook-url to register %q", rawURL)
}

// requireAPIWebhookFixtureRequestsLocal enforces that the fixture polling
// endpoint is local; unlike the receiver URL there is no external override.
func requireAPIWebhookFixtureRequestsLocal(rawURL string) error {
	local, err := isLocalAPIURL(rawURL)
	if err != nil {
		return fmt.Errorf("invalid webhook-requests-url: %w", err)
	}
	if !local {
		return fmt.Errorf("webhook-requests-url must be local for api smoke --exercise webhook: %q", rawURL)
	}
	return nil
}
// apiJSONInt64 extracts a top-level integer field from a JSON object
// response body.
//
// Fix: decode with json.Number instead of the default float64 mapping.
// The float64 path silently truncated fractional values (1.5 -> 1) and lost
// precision for integers above 2^53; json.Number preserves full int64
// precision and rejects non-integers explicitly.
func apiJSONInt64(body json.RawMessage, field string) (int64, error) {
	dec := json.NewDecoder(strings.NewReader(string(body)))
	dec.UseNumber() // keep numbers as json.Number; avoids float64 rounding
	var obj map[string]any
	if err := dec.Decode(&obj); err != nil {
		return 0, err
	}
	raw, ok := obj[field]
	if !ok {
		return 0, fmt.Errorf("response missing %q", field)
	}
	num, ok := raw.(json.Number)
	if !ok {
		return 0, fmt.Errorf("response field %q is %T, want number", field, raw)
	}
	value, err := num.Int64()
	if err != nil {
		return 0, fmt.Errorf("response field %q is not a valid integer: %w", field, err)
	}
	return value, nil
}

// apiJSONString extracts a top-level string field from a JSON object
// response body, erroring when the field is missing or not a string.
func apiJSONString(body json.RawMessage, field string) (string, error) {
	var obj map[string]any
	if err := json.Unmarshal(body, &obj); err != nil {
		return "", err
	}
	raw, ok := obj[field]
	if !ok {
		return "", fmt.Errorf("response missing %q", field)
	}
	value, ok := raw.(string)
	if !ok {
		return "", fmt.Errorf("response field %q is %T, want string", field, raw)
	}
	return value, nil
}

// redactAPISecretError scrubs the secret (raw and query-escaped) from an
// error message. When a replacement happened, a flattened errors.New is
// returned — the wrap chain is intentionally dropped so the secret cannot
// resurface through errors.Unwrap/Is/As.
func redactAPISecretError(err error, secret string) error {
	if err == nil {
		return nil
	}
	secret = strings.TrimSpace(secret)
	if secret == "" {
		return err
	}
	msg := err.Error()
	msg = strings.ReplaceAll(msg, secret, "redacted")
	msg = strings.ReplaceAll(msg, url.QueryEscape(secret), "redacted")
	if msg == err.Error() {
		return err
	}
	return errors.New(msg)
}
q.Set("secret", "redacted") + u.RawQuery = q.Encode() + } + return u.String() +} + +func apiSmokeIDKey(smoke apiSmokeOptions, suffix string) string { + if smoke.idempotencyKeyPrefix == "" { + return "" + } + return smoke.idempotencyKeyPrefix + "-" + suffix +} + +func apiCLINewBatchID(prefix string) string { + return fmt.Sprintf("%s-%s", prefix, time.Now().UTC().Format("20060102T150405Z")) +} + +func apiCLIBatchBlogIDStart(batch string) int64 { + h := fnv.New32a() + _, _ = h.Write([]byte(batch)) + // Reserve a deterministic 1,000-id slot in the high local-test range. + return 910000000 + int64(h.Sum32()%90000)*1000 +} diff --git a/cmd/jetmon2/api_cli_workflows_test.go b/cmd/jetmon2/api_cli_workflows_test.go new file mode 100644 index 00000000..75137f25 --- /dev/null +++ b/cmd/jetmon2/api_cli_workflows_test.go @@ -0,0 +1,477 @@ +package main + +import ( + "bytes" + "context" + "crypto/hmac" + "crypto/sha256" + "encoding/hex" + "encoding/json" + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync" + "testing" + "time" +) + +func TestRunAPISmokeHappyPath(t *testing.T) { + var calls []string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + calls = append(calls, r.Method+" "+r.URL.Path) + if r.URL.Path != "/api/v1/health" && r.Header.Get("Authorization") != "Bearer token-123" { + t.Fatalf("missing auth for %s %s", r.Method, r.URL.Path) + } + switch { + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/health": + writeTestJSON(t, w, map[string]string{"status": "ok"}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/me": + writeTestJSON(t, w, map[string]any{"consumer_name": "api-cli-test", "scope": "admin"}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/sites": + var body map[string]any + decodeTestJSON(t, r, &body) + if body["blog_id"] != float64(910) { + t.Fatalf("blog_id = %#v, want 910", body["blog_id"]) + } + headers := body["custom_headers"].(map[string]any) + if 
headers[apiCLIBatchHeader] != "smoke-test" { + t.Fatalf("batch header = %#v, want smoke-test", headers[apiCLIBatchHeader]) + } + writeTestStatusJSON(t, w, http.StatusCreated, map[string]any{"id": 910, "blog_id": 910}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/sites/910/trigger-now": + writeTestJSON(t, w, map[string]any{"result": map[string]any{"success": true}}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/910/events": + writeTestJSON(t, w, map[string]any{"data": []any{}, "page": map[string]any{"limit": 5}}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/alert-contacts": + writeTestStatusJSON(t, w, http.StatusCreated, map[string]any{"id": 77, "label": "api-cli-smoke-smoke-test"}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/alert-contacts/77/test": + writeTestJSON(t, w, map[string]any{"contact_id": 77, "delivered": true}) + case r.Method == http.MethodDelete && r.URL.Path == "/api/v1/alert-contacts/77": + w.WriteHeader(http.StatusNoContent) + case r.Method == http.MethodDelete && r.URL.Path == "/api/v1/sites/910": + w.WriteHeader(http.StatusNoContent) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.Path) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISmoke(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + token: "token-123", + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSmokeOptions{ + batch: "smoke-test", + blogID: 910, + url: "https://example.com/", + cleanup: true, + exercise: "alert-contact", + }) + if err != nil { + t.Fatalf("runAPISmoke() error = %v\nstdout=%s", err, stdout.String()) + } + + var summary apiSmokeSummary + if err := json.Unmarshal(stdout.Bytes(), &summary); err != nil { + t.Fatalf("unmarshal summary: %v\n%s", err, stdout.String()) + } + if summary.Batch != "smoke-test" || summary.BlogID != 910 { + t.Fatalf("summary batch/id = %q/%d", summary.Batch, summary.BlogID) + } + if 
len(summary.Steps) != 7 { + t.Fatalf("steps = %#v, want 7 steps", summary.Steps) + } + for _, step := range summary.Steps { + if step.Status != "ok" { + t.Fatalf("step %#v, want ok", step) + } + } + if len(summary.CleanupResults) != 2 { + t.Fatalf("cleanup results = %#v, want contact and site cleanup", summary.CleanupResults) + } + wantCalls := []string{ + "GET /api/v1/health", + "GET /api/v1/me", + "POST /api/v1/sites", + "POST /api/v1/sites/910/trigger-now", + "GET /api/v1/sites/910/events", + "POST /api/v1/alert-contacts", + "POST /api/v1/alert-contacts/77/test", + "DELETE /api/v1/alert-contacts/77", + "DELETE /api/v1/sites/910", + } + if strings.Join(calls, "\n") != strings.Join(wantCalls, "\n") { + t.Fatalf("calls:\n%s\nwant:\n%s", strings.Join(calls, "\n"), strings.Join(wantCalls, "\n")) + } +} + +func TestRunAPISmokeWebhookExercise(t *testing.T) { + const webhookSecret = "whsec_TESTSMOKESECRET" + + fixture := newSmokeWebhookFixture(t) + defer fixture.Close() + + var ( + calls []string + triggerCalls int + registeredURL string + ) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + calls = append(calls, r.Method+" "+r.URL.Path) + if r.URL.Path != "/api/v1/health" && r.Header.Get("Authorization") != "Bearer token-123" { + t.Fatalf("missing auth for %s %s", r.Method, r.URL.Path) + } + switch { + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/health": + writeTestJSON(t, w, map[string]string{"status": "ok"}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/me": + writeTestJSON(t, w, map[string]any{"consumer_name": "api-cli-test", "scope": "admin"}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/sites": + writeTestStatusJSON(t, w, http.StatusCreated, map[string]any{"id": 910, "blog_id": 910}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/sites/910/trigger-now": + triggerCalls++ + if triggerCalls == 2 { + postSignedSmokeWebhook(t, registeredURL, webhookSecret, 
[]byte(`{"type":"event.opened","site_id":910}`)) + writeTestJSON(t, w, map[string]any{"result": map[string]any{"success": false, "http_code": 500}}) + return + } + writeTestJSON(t, w, map[string]any{"result": map[string]any{"success": true}}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/910/events" && r.URL.RawQuery == "limit=5": + writeTestJSON(t, w, map[string]any{"data": []any{}, "page": map[string]any{"limit": 5}}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/910" && r.URL.Query().Get("include_cli_metadata") == "true": + writeTestJSON(t, w, map[string]any{"id": 910, "blog_id": 910, "cli_batch": "smoke-webhook"}) + case r.Method == http.MethodPost && r.URL.Path == "/api/v1/webhooks": + var body map[string]any + decodeTestJSON(t, r, &body) + if body["url"] != fixture.URL+"/webhook" { + t.Fatalf("webhook url = %#v", body["url"]) + } + if body["active"] != false { + t.Fatalf("webhook active = %#v, want false until secret is registered", body["active"]) + } + writeTestStatusJSON(t, w, http.StatusCreated, map[string]any{ + "id": 88, + "url": fixture.URL + "/webhook", + "active": false, + "events": []string{apiSmokeWebhookEvent}, + "secret_preview": "whsec_TEST...", + "secret": webhookSecret, + }) + case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/webhooks/88": + var body map[string]any + decodeTestJSON(t, r, &body) + registeredURL = body["url"].(string) + if !strings.Contains(registeredURL, "secret="+webhookSecret) { + t.Fatalf("registered URL did not include fixture secret: %q", registeredURL) + } + if body["active"] != true { + t.Fatalf("webhook active = %#v, want true", body["active"]) + } + writeTestJSON(t, w, map[string]any{ + "id": 88, + "url": registeredURL, + "active": true, + "events": []string{apiSmokeWebhookEvent}, + "secret_preview": "whsec_TEST...", + }) + case r.Method == http.MethodPatch && r.URL.Path == "/api/v1/sites/910": + var body map[string]any + decodeTestJSON(t, r, &body) + if 
!strings.Contains(fmt.Sprint(body["monitor_url"]), "/status/500") { + t.Fatalf("monitor_url = %#v, want fixture failure URL", body["monitor_url"]) + } + writeTestJSON(t, w, map[string]any{"id": 910, "blog_id": 910}) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/910/events" && r.URL.RawQuery == "active=true&limit=10": + writeTestJSON(t, w, map[string]any{ + "data": []any{ + map[string]any{"id": 321, "state": apiSmokeWebhookState, "severity": 3}, + }, + "page": map[string]any{"limit": 10}, + }) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/sites/910/events/321/transitions": + writeTestJSON(t, w, map[string]any{ + "data": []any{ + map[string]any{"id": 654, "event_id": 321, "reason": "opened", "state_after": apiSmokeWebhookState, "severity_after": 3}, + }, + "page": map[string]any{"limit": 50}, + }) + case r.Method == http.MethodGet && r.URL.Path == "/api/v1/webhooks/88/deliveries": + writeTestJSON(t, w, map[string]any{ + "data": []any{ + map[string]any{ + "id": 776, + "status": "delivered", + "event_id": 321, + "event_type": apiSmokeWebhookEvent, + "payload": map[string]any{"type": apiSmokeWebhookEvent, "site_id": 910}, + }, + map[string]any{ + "id": 777, + "status": "delivered", + "event_id": 321, + "event_type": apiSmokeWebhookEvent, + "payload": map[string]any{"type": apiSmokeWebhookEvent, "site_id": 910}, + }, + }, + "page": map[string]any{"limit": 10}, + }) + case r.Method == http.MethodDelete && r.URL.Path == "/api/v1/webhooks/88": + w.WriteHeader(http.StatusNoContent) + case r.Method == http.MethodDelete && r.URL.Path == "/api/v1/sites/910": + w.WriteHeader(http.StatusNoContent) + default: + t.Fatalf("unexpected request: %s %s?%s", r.Method, r.URL.Path, r.URL.RawQuery) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISmoke(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + token: "token-123", + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSmokeOptions{ + 
batch: "smoke-webhook", + blogID: 910, + url: "https://example.com/", + cleanup: true, + exercise: "webhook", + webhookURL: fixture.URL + "/webhook", + webhookRequestsURL: fixture.URL + "/webhook/requests", + webhookWait: 2 * time.Second, + webhookPollInterval: 10 * time.Millisecond, + fixtureURL: fixture.URL, + fixtureProbeURL: fixture.URL + "/health", + }) + if err != nil { + t.Fatalf("runAPISmoke() error = %v\nstdout=%s", err, stdout.String()) + } + + var summary apiSmokeSummary + if err := json.Unmarshal(stdout.Bytes(), &summary); err != nil { + t.Fatalf("unmarshal summary: %v\n%s", err, stdout.String()) + } + if summary.Webhook == nil || summary.Webhook.ID != 88 { + t.Fatalf("webhook summary = %#v, want webhook id 88", summary.Webhook) + } + if strings.Contains(summary.Webhook.URL, webhookSecret) { + t.Fatalf("webhook summary URL leaked raw secret: %q", summary.Webhook.URL) + } + if summary.WebhookFixture == nil || !summary.WebhookFixture.SignatureVerified { + t.Fatalf("fixture summary = %#v, want verified signature", summary.WebhookFixture) + } + if summary.FailureSimulation == nil || summary.FailureSimulation.TransitionCount != 1 { + t.Fatalf("failure simulation = %#v, want one transition", summary.FailureSimulation) + } + if len(summary.CleanupResults) != 2 { + t.Fatalf("cleanup results = %#v, want webhook and site cleanup", summary.CleanupResults) + } + + wantCalls := []string{ + "GET /api/v1/health", + "GET /api/v1/me", + "POST /api/v1/sites", + "POST /api/v1/sites/910/trigger-now", + "GET /api/v1/sites/910/events", + "POST /api/v1/webhooks", + "PATCH /api/v1/webhooks/88", + "GET /api/v1/sites/910", + "PATCH /api/v1/sites/910", + "POST /api/v1/sites/910/trigger-now", + "GET /api/v1/sites/910/events", + "GET /api/v1/sites/910/events/321/transitions", + "GET /api/v1/webhooks/88/deliveries", + "DELETE /api/v1/webhooks/88", + "DELETE /api/v1/sites/910", + } + if strings.Join(calls, "\n") != strings.Join(wantCalls, "\n") { + t.Fatalf("calls:\n%s\nwant:\n%s", 
strings.Join(calls, "\n"), strings.Join(wantCalls, "\n")) + } +} + +func TestRunAPISmokeWritesFailureSummary(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/api/v1/health": + writeTestJSON(t, w, map[string]string{"status": "ok"}) + case "/api/v1/me": + writeTestStatusJSON(t, w, http.StatusUnauthorized, map[string]string{"error": "missing token"}) + default: + t.Fatalf("unexpected request: %s", r.URL.Path) + } + })) + defer srv.Close() + + var stdout bytes.Buffer + err := runAPISmoke(context.Background(), srv.Client(), apiCLIOptions{ + baseURL: srv.URL, + timeout: time.Second, + out: &stdout, + errOut: ioDiscard{}, + }, apiSmokeOptions{ + batch: "smoke-failure", + blogID: 911, + cleanup: true, + exercise: "none", + }) + if err == nil { + t.Fatal("runAPISmoke() error = nil, want auth failure") + } + var summary apiSmokeSummary + if err := json.Unmarshal(stdout.Bytes(), &summary); err != nil { + t.Fatalf("unmarshal summary: %v\n%s", err, stdout.String()) + } + if len(summary.Steps) != 2 { + t.Fatalf("steps = %#v, want health + failed me", summary.Steps) + } + if summary.Steps[1].Name != "me" || summary.Steps[1].Status != "failed" { + t.Fatalf("failed step = %#v, want me failed", summary.Steps[1]) + } +} + +func TestRedactAPISecretError(t *testing.T) { + err := redactAPISecretError( + fmt.Errorf(`PATCH /api/v1/webhooks/88 returned 400 Bad Request: {"url":"http://api-fixture:8091/webhook?secret=whsec_TEST"}`), + "whsec_TEST", + ) + if err == nil { + t.Fatal("redactAPISecretError() = nil, want error") + } + if strings.Contains(err.Error(), "whsec_TEST") { + t.Fatalf("redactAPISecretError() leaked secret: %v", err) + } + if !strings.Contains(err.Error(), "secret=redacted") { + t.Fatalf("redactAPISecretError() = %v, want redacted query value", err) + } +} + +func TestAPIDeliveredWebhookRowsIncludeSiteRequiresExpectedDeliveryID(t *testing.T) { + body := json.RawMessage(`{ + "data": [ 
+ {"id": 776, "status": "delivered", "payload": {"type": "event.opened", "site_id": 910}}, + {"id": 778, "status": "delivered", "payload": {"type": "event.opened", "site_id": 911}} + ] + }`) + if apiDeliveredWebhookRowsIncludeSite(body, 910, "777") { + t.Fatal("apiDeliveredWebhookRowsIncludeSite() = true for wrong delivery id") + } + if !apiDeliveredWebhookRowsIncludeSite(body, 910, "776") { + t.Fatal("apiDeliveredWebhookRowsIncludeSite() = false for expected delivery id") + } +} + +func TestAPICLIBatchBlogIDStartStable(t *testing.T) { + first := apiCLIBatchBlogIDStart("batch-a") + second := apiCLIBatchBlogIDStart("batch-a") + if first != second { + t.Fatalf("batch id start not stable: %d != %d", first, second) + } + if first < 910000000 || first >= 1000000000 { + t.Fatalf("batch id start = %d, want high local-test range", first) + } +} + +type smokeWebhookFixture struct { + *httptest.Server + mu sync.Mutex + requests []apiSmokeFixtureWebhookHit +} + +func newSmokeWebhookFixture(t *testing.T) *smokeWebhookFixture { + t.Helper() + fixture := &smokeWebhookFixture{} + fixture.Server = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/health": + writeTestJSON(t, w, map[string]string{"status": "ok"}) + case r.Method == http.MethodDelete && r.URL.Path == "/webhook/requests": + fixture.mu.Lock() + fixture.requests = nil + fixture.mu.Unlock() + w.WriteHeader(http.StatusNoContent) + case r.Method == http.MethodGet && r.URL.Path == "/webhook/requests": + fixture.mu.Lock() + requests := append([]apiSmokeFixtureWebhookHit(nil), fixture.requests...) 
+ fixture.mu.Unlock() + writeTestJSON(t, w, map[string]any{"count": len(requests), "requests": requests}) + case r.Method == http.MethodPost && r.URL.Path == "/webhook": + body, err := io.ReadAll(r.Body) + if err != nil { + t.Fatalf("read webhook body: %v", err) + } + valid := smokeTestSignatureValid(r.Header.Get("X-Jetmon-Signature"), body, r.URL.Query().Get("secret")) + fixture.mu.Lock() + fixture.requests = append(fixture.requests, apiSmokeFixtureWebhookHit{ + ID: len(fixture.requests) + 1, + Event: r.Header.Get("X-Jetmon-Event"), + Delivery: r.Header.Get("X-Jetmon-Delivery"), + Signature: r.Header.Get("X-Jetmon-Signature"), + SignatureValid: &valid, + Body: string(body), + }) + fixture.mu.Unlock() + w.WriteHeader(http.StatusNoContent) + default: + t.Fatalf("unexpected fixture request: %s %s", r.Method, r.URL.Path) + } + })) + return fixture +} + +func postSignedSmokeWebhook(t *testing.T, target, secret string, body []byte) { + t.Helper() + req, err := http.NewRequest(http.MethodPost, target, bytes.NewReader(body)) + if err != nil { + t.Fatalf("build webhook request: %v", err) + } + req.Header.Set("X-Jetmon-Event", apiSmokeWebhookEvent) + req.Header.Set("X-Jetmon-Delivery", "777") + req.Header.Set("X-Jetmon-Signature", smokeTestSignature(1700000000, body, secret)) + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("post webhook: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusNoContent { + t.Fatalf("post webhook status = %s", resp.Status) + } +} + +func smokeTestSignature(ts int64, body []byte, secret string) string { + mac := hmac.New(sha256.New, []byte(secret)) + _, _ = mac.Write([]byte(fmt.Sprintf("%d.", ts))) + _, _ = mac.Write(body) + return fmt.Sprintf("t=%d,v1=%s", ts, hex.EncodeToString(mac.Sum(nil))) +} + +func smokeTestSignatureValid(signature string, body []byte, secret string) bool { + return signature == smokeTestSignature(1700000000, body, secret) +} + +func decodeTestJSON(t *testing.T, r 
*http.Request, v any) { + t.Helper() + if err := json.NewDecoder(r.Body).Decode(v); err != nil { + t.Fatalf("decode request body: %v", err) + } +} + +func writeTestJSON(t *testing.T, w http.ResponseWriter, v any) { + t.Helper() + writeTestStatusJSON(t, w, http.StatusOK, v) +} + +func writeTestStatusJSON(t *testing.T, w http.ResponseWriter, status int, v any) { + t.Helper() + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + if err := json.NewEncoder(w).Encode(v); err != nil { + t.Fatalf("encode response: %v", err) + } +} diff --git a/cmd/jetmon2/main.go b/cmd/jetmon2/main.go new file mode 100644 index 00000000..0ff13d0a --- /dev/null +++ b/cmd/jetmon2/main.go @@ -0,0 +1,1421 @@ +package main + +import ( + "context" + "database/sql" + "flag" + "fmt" + "io" + "log" + "net" + "net/http" + "os" + "os/signal" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "syscall" + "time" + + "github.com/Automattic/jetmon/internal/alerting" + "github.com/Automattic/jetmon/internal/api" + "github.com/Automattic/jetmon/internal/apikeys" + "github.com/Automattic/jetmon/internal/audit" + "github.com/Automattic/jetmon/internal/checker" + "github.com/Automattic/jetmon/internal/config" + "github.com/Automattic/jetmon/internal/dashboard" + "github.com/Automattic/jetmon/internal/db" + "github.com/Automattic/jetmon/internal/deliverer" + "github.com/Automattic/jetmon/internal/fleethealth" + "github.com/Automattic/jetmon/internal/metrics" + "github.com/Automattic/jetmon/internal/orchestrator" + "github.com/Automattic/jetmon/internal/processmetrics" + "github.com/Automattic/jetmon/internal/veriflier" + "github.com/Automattic/jetmon/internal/wpcom" +) + +const processHealthWriteTimeout = 2 * time.Second + +// Injected at build time via -ldflags. 
+var ( + version = "dev" + buildDate = "unknown" + goVersion = "unknown" +) + +func main() { + if len(os.Args) < 2 { + runServe() + return + } + + if isVersionCommand(os.Args[1]) { + printVersion(os.Stdout) + return + } + + switch os.Args[1] { + case "migrate": + cmdMigrate() + case "validate-config": + cmdValidateConfig() + case "status": + cmdStatus() + case "audit": + cmdAudit() + case "drain": + cmdDrain() + case "reload": + cmdReload() + case "keys": + cmdKeys(os.Args[2:]) + case "api": + cmdAPI(os.Args[2:]) + case "site-tenants": + cmdSiteTenants(os.Args[2:]) + case "telemetry": + cmdTelemetry(os.Args[2:]) + case "verifliers": + cmdVerifliers(os.Args[2:]) + case "rollout": + cmdRollout(os.Args[2:]) + default: + runServe() + } +} + +func isVersionCommand(arg string) bool { + switch arg { + case "version", "--version", "-version": + return true + default: + return false + } +} + +func printVersion(w io.Writer) { + fmt.Fprintf(w, "jetmon2 %s (built %s with %s)\n", version, buildDate, goVersion) +} + +// runServe is the main entry point for the monitoring service. 
+func runServe() { + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + + if err := config.Load(configPath); err != nil { + log.Fatalf("load config: %v", err) + } + cfg := config.Get() + if err := checker.ConfigureResolverServers(cfg.CheckDNSResolvers); err != nil { + log.Fatalf("configure check DNS resolvers: %v", err) + } + log.Printf("config: legacy_status_projection=%s", enabledLabel(cfg.LegacyStatusProjectionEnable)) + log.Printf("config: bucket_ownership=%s", bucketOwnershipLabel(cfg)) + log.Printf("config: scheduler=%s", schedulerConfigLabel(cfg)) + log.Printf("config: default_check_policy=method:%s profile:%s", cfg.DefaultCheckMethod, cfg.DefaultDetectionProfile) + log.Printf("config: check_dns_resolvers=%s", checkDNSResolversLabel(checker.ConfiguredResolverServers())) + log.Printf("config: wpcom_notify=%s", enabledLabel(cfg.WPCOMNotifyEnable)) + log.Printf("config: email_transport=%s", emailTransportLabel(cfg)) + if !emailTransportDelivers(cfg) { + log.Printf("WARN: email_transport=%s — alert-contact emails will be logged but not delivered", emailTransportLabel(cfg)) + } + if cfg.DashboardPort > 0 { + if msg := dashboardBindWarning(cfg.DashboardBindAddr); msg != "" { + log.Printf("WARN: %s", msg) + } + } + + config.LoadDB() + if err := db.ConnectWithRetry(10); err != nil { + log.Fatalf("db connect: %v", err) + } + + pidPath := envOrDefault("JETMON_PID_FILE", "/run/jetmon2/jetmon2.pid") + if err := writePIDFile(pidPath); err != nil { + log.Printf("warning: could not write PID file %s: %v", pidPath, err) + } else { + defer removePIDFile(pidPath) + } + + audit.Init(db.DB()) + + if err := metrics.Init("statsd:8125", db.Hostname()); err != nil { + log.Printf("warning: statsd init failed: %v", err) + } + + hostname := db.Hostname() + processStartedAt := time.Now().UTC() + processID := fleethealth.ProcessID(hostname, fleethealth.ProcessMonitor) + + wp := wpcom.New(cfg.AuthToken, hostname) + + orch := orchestrator.New(cfg, wp) + if err := 
orch.ClaimBuckets(); err != nil { + log.Fatalf("claim buckets: %v", err) + } + + var dash *dashboard.Server + if cfg.DashboardPort > 0 { + dash = dashboard.New(hostname) + dash.SetFleetSource(newFleetDashboardStore(cfg)) + go func() { + addr := dashboardListenAddr(cfg) + if err := dash.Listen(addr); err != nil { + log.Printf("dashboard: %v", err) + } + }() + } + + // pprof on localhost only — never expose this on a public interface. + if cfg.DebugPort > 0 { + go func() { + addr := fmt.Sprintf("127.0.0.1:%d", cfg.DebugPort) + if err := dashboard.ListenDebug(addr); err != nil { + log.Printf("debug server: %v", err) + } + }() + } + + // Internal API server. Disabled when API_PORT is 0. Bears auth via + // jetmon_api_keys; key management is CLI-only (`./jetmon2 keys`). + var apiSrv *api.Server + if cfg.APIPort > 0 { + apiSrv = api.New(fmt.Sprintf(":%d", cfg.APIPort), db.DB(), hostname) + go func() { + if err := apiSrv.Listen(); err != nil && !api.IsServerClosed(err) { + log.Printf("api: %v", err) + } + }() + } + + if level, msg := deliveryOwnerStatus(cfg, hostname); msg != "" { + if level == "WARN" { + log.Printf("WARN: %s", msg) + } else { + log.Printf("config: %s", msg) + } + } + deliveryWorkersEnabled := deliveryWorkersShouldStart(cfg, hostname) + + var alertDispatchers map[alerting.Transport]alerting.Dispatcher + if cfg.APIPort > 0 { + alertDispatchers = deliverer.BuildAlertDispatchers(cfg) + if apiSrv != nil { + apiSrv.SetAlertDispatchers(alertDispatchers) + } + } + + // Embedded outbound delivery workers. Disabled when API_PORT is 0 + // (no API to manage webhooks or alert contacts) or when + // DELIVERY_OWNER_HOST names another host. 
+ var deliveryRuntime *deliverer.Runtime + if deliveryWorkersEnabled { + deliveryRuntime = deliverer.Start(deliverer.Config{ + DB: db.DB(), + InstanceID: hostname, + Dispatchers: alertDispatchers, + }) + } + + var healthMu sync.RWMutex + var publishMu sync.Mutex + var shuttingDown atomic.Bool + var lastHealth []dashboard.HealthEntry + publishHostSnapshot := func(state string, refreshDependencies bool) { + publishMu.Lock() + defer publishMu.Unlock() + if shuttingDown.Load() && state == fleethealth.StateRunning { + return + } + currentCfg := config.Get() + if currentCfg == nil { + currentCfg = cfg + } + checkedAt := time.Now().UTC() + var health []dashboard.HealthEntry + if refreshDependencies { + health = dashboardHealthEntries(context.Background(), currentCfg, db.DB(), wp, metrics.Global() != nil, checkedAt) + healthMu.Lock() + lastHealth = append([]dashboard.HealthEntry(nil), health...) + healthMu.Unlock() + } else { + healthMu.RLock() + health = append([]dashboard.HealthEntry(nil), lastHealth...) 
+ healthMu.RUnlock() + } + bMin, bMax := orch.BucketRange() + sitesPerSec, roundDuration := orch.LastRoundStats() + mem := processmetrics.CurrentMemory() + deliveryConfigEligible := deliveryWorkersShouldStart(currentCfg, hostname) + st := dashboard.State{ + WorkerCount: orch.WorkerCount(), + ActiveChecks: orch.ActiveChecks(), + QueueDepth: orch.QueueDepth(), + RetryQueueSize: orch.RetryQueueSize(), + SitesPerSec: sitesPerSec, + RoundDurationMs: roundDuration.Milliseconds(), + WPCOMCircuitOpen: wp.IsCircuitOpen(), + WPCOMQueueDepth: wp.QueueDepth(), + GoSysMemMB: mem.GoSysMemMB, + RSSMemMB: mem.RSSMemMB, + BucketMin: bMin, + BucketMax: bMax, + BucketOwnership: bucketOwnershipLabel(currentCfg), + LegacyStatusProjectionEnabled: currentCfg.LegacyStatusProjectionEnable, + DeliveryWorkersEnabled: deliveryWorkersEnabled, + DeliveryConfigEligible: deliveryConfigEligible, + DeliveryOwnerHost: currentCfg.DeliveryOwnerHost, + RolloutPreflightCommand: rolloutPreflightCommand(currentCfg), + RolloutCutoverCommand: cutoverCheckCommand(currentCfg), + RolloutActivityCommand: rolloutActivityCommand(), + RolloutRollbackCommand: rollbackCheckCommand(currentCfg), + RolloutStateReportCommand: stateReportCommand(), + ProjectionDriftCommand: projectionDriftCommand(), + } + st.Hostname = hostname + st.UpdatedAt = checkedAt + if dash != nil { + if refreshDependencies { + dash.UpdateHealth(health) + } + dash.Update(st) + } + ctx, cancel := context.WithTimeout(context.Background(), processHealthWriteTimeout) + if err := fleethealth.Upsert(ctx, db.DB(), monitorProcessHealthSnapshot(hostname, processStartedAt, state, currentCfg, st, health)); err != nil { + log.Printf("process health: %v", err) + } + cancel() + } + + // Publish both host-dashboard state and the durable fleet-health heartbeat. 
+ publishHostSnapshot(fleethealth.StateRunning, false) + stopHostPublisher := make(chan struct{}) + var stopHostPublisherOnce sync.Once + go func() { + ticker := time.NewTicker(time.Duration(cfg.StatsUpdateIntervalMS) * time.Millisecond) + defer ticker.Stop() + publishHostSnapshot(fleethealth.StateRunning, true) + for { + select { + case <-ticker.C: + publishHostSnapshot(fleethealth.StateRunning, true) + case <-stopHostPublisher: + return + } + } + }() + + // Signal handling. + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP) + + go func() { + for sig := range sigCh { + switch sig { + case syscall.SIGHUP: + log.Println("received SIGHUP, reloading config") + if err := config.Reload(); err != nil { + log.Printf("config reload failed: %v", err) + } else { + if dash != nil { + dash.SetFleetSource(newFleetDashboardStore(config.Get())) + } + log.Println("config reloaded; CHECK_DNS_RESOLVERS changes require restart") + } + case syscall.SIGINT, syscall.SIGTERM: + log.Println("received shutdown signal, draining") + shuttingDown.Store(true) + stopHostPublisherOnce.Do(func() { close(stopHostPublisher) }) + publishHostSnapshot(fleethealth.StateStopping, false) + if apiSrv != nil { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + if err := apiSrv.Shutdown(ctx); err != nil { + log.Printf("api: shutdown error: %v", err) + } + cancel() + } + if deliveryRuntime != nil { + deliveryRuntime.Stop() + } + orch.Stop() + // Hard kill if drain takes too long (e.g. a stalled HTTP check). 
+ time.AfterFunc(30*time.Second, func() { + log.Println("jetmon2: shutdown timeout exceeded, forcing exit") + os.Exit(1) + }) + } + } + }() + + orch.Run() + shuttingDown.Store(true) + stopHostPublisherOnce.Do(func() { close(stopHostPublisher) }) + publishHostSnapshot(fleethealth.StateStopping, false) + ctx, cancel := context.WithTimeout(context.Background(), processHealthWriteTimeout) + if err := fleethealth.MarkStopped(ctx, db.DB(), processID, time.Now().UTC()); err != nil { + log.Printf("process health: %v", err) + } + cancel() + log.Println("jetmon2: shutdown complete") +} + +func cmdMigrate() { + config.LoadDB() + if err := db.ConnectWithRetry(5); err != nil { + log.Fatalf("db connect: %v", err) + } + if err := db.Migrate(); err != nil { + log.Fatalf("migrate: %v", err) + } + fmt.Println("migrations applied successfully") +} + +func cmdValidateConfig() { + configPath := envOrDefault("JETMON_CONFIG", "config/config.json") + if err := config.Load(configPath); err != nil { + fmt.Fprintf(os.Stderr, "FAIL config parse: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS config parse") + + config.LoadDB() + if err := db.ConnectWithRetry(3); err != nil { + fmt.Fprintf(os.Stderr, "FAIL db connect: %v\n", err) + os.Exit(1) + } + fmt.Println("PASS db connect") + + cfg := config.Get() + fmt.Printf("INFO legacy_status_projection=%s\n", enabledLabel(cfg.LegacyStatusProjectionEnable)) + fmt.Printf("INFO bucket_ownership=%s\n", bucketOwnershipLabel(cfg)) + fmt.Printf("INFO scheduler=%s\n", schedulerConfigLabel(cfg)) + fmt.Printf("INFO default_check_policy=method:%s profile:%s\n", cfg.DefaultCheckMethod, cfg.DefaultDetectionProfile) + fmt.Printf("INFO wpcom_notify=%s\n", enabledLabel(cfg.WPCOMNotifyEnable)) + for _, line := range rolloutAdviceLines(cfg) { + fmt.Println(line) + } + fmt.Printf("INFO email_transport=%s\n", emailTransportLabel(cfg)) + if !emailTransportDelivers(cfg) { + fmt.Printf("WARN email_transport=%s — alert-contact emails will be logged but not delivered\n", 
emailTransportLabel(cfg)) + } + if cfg.DashboardPort > 0 { + if msg := dashboardBindWarning(cfg.DashboardBindAddr); msg != "" { + fmt.Printf("WARN %s\n", msg) + } + } + if level, msg := deliveryOwnerStatus(cfg, db.Hostname()); msg != "" { + fmt.Printf("%s %s\n", level, msg) + } + readiness := probeConfiguredVerifliers(context.Background(), cfg, dashboardHealthTimeout) + readinessLines, readinessFailed := renderVeriflierReadiness(readiness) + for _, line := range readinessLines { + fmt.Println(line) + } + discoverySnapshot, discoveryErr := veriflierDiscoverySnapshotForConfig(context.Background(), cfg) + discoveryLines, discoveryFailed := renderVeriflierDiscoveryReadiness(cfg.VeriflierDiscoveryModeOrDefault(), discoverySnapshot, discoveryErr, readiness) + for _, line := range discoveryLines { + fmt.Println(line) + } + if readinessFailed || discoveryFailed { + os.Exit(1) + } + + fmt.Println("\nvalidation passed") +} + +type veriflierReadinessResult struct { + Name string + Addr string + Status *veriflier.StatusV2Response + Err error + Latency time.Duration +} + +func probeConfiguredVerifliers(ctx context.Context, cfg *config.Config, timeout time.Duration) []veriflierReadinessResult { + if cfg == nil || len(cfg.Verifiers) == 0 { + return nil + } + if ctx == nil { + ctx = context.Background() + } + out := make([]veriflierReadinessResult, 0, len(cfg.Verifiers)) + for i, v := range cfg.Verifiers { + name := configuredVeriflierName(v, i) + addr := fmt.Sprintf("%s:%s", v.Host, v.TransportPort()) + result := veriflierReadinessResult{Name: name, Addr: addr} + if v.Host == "" || v.TransportPort() == "" { + result.Err = fmt.Errorf("host or port is not configured") + out = append(out, result) + continue + } + + probeCtx, cancel := context.WithTimeout(ctx, timeout) + start := time.Now() + status, err := veriflier.NewVeriflierClient(addr, v.AuthToken).Status(probeCtx) + cancel() + result.Latency = time.Since(start) + result.Status = status + result.Err = err + out = append(out, 
result) + } + return out +} + +func renderVeriflierReadiness(results []veriflierReadinessResult) ([]string, bool) { + if len(results) == 0 { + return nil, false + } + vantageCounts := duplicateVantageCounts(results) + lines := make([]string, 0, len(results)*2) + failed := false + for _, result := range results { + lines = append(lines, fmt.Sprintf("INFO veriflier %q at %s", result.Name, result.Addr)) + if result.Err != nil { + lines = append(lines, fmt.Sprintf("WARN veriflier_status name=%q addr=%q error=%q", result.Name, result.Addr, result.Err.Error())) + continue + } + if result.Status == nil { + lines = append(lines, fmt.Sprintf("WARN veriflier_status name=%q addr=%q error=%q", result.Name, result.Addr, "empty status response")) + continue + } + if !statusSupportsProtocol(result.Status, veriflier.ProtocolV2) { + lines = append(lines, fmt.Sprintf("WARN veriflier_contract name=%q addr=%q protocol=%s version=%q", result.Name, result.Addr, veriflier.ProtocolLegacy, result.Status.Version)) + continue + } + vantageID := strings.TrimSpace(result.Status.Vantage.ID) + if vantageID == "" { + failed = true + lines = append(lines, fmt.Sprintf("FAIL veriflier_vantage_missing name=%q addr=%q", result.Name, result.Addr)) + continue + } + if vantageCounts[vantageID] > 1 { + failed = true + lines = append(lines, fmt.Sprintf("FAIL veriflier_vantage_duplicate id=%q name=%q addr=%q", vantageID, result.Name, result.Addr)) + continue + } + lines = append(lines, fmt.Sprintf("PASS veriflier_contract name=%q addr=%q protocol=%s vantage_id=%q agent_id=%q capacity=%q", + result.Name, result.Addr, veriflier.ProtocolV2, vantageID, result.Status.Agent.ID, verifierCapacitySummary(result.Status.Capacity))) + } + return lines, failed +} + +func veriflierDiscoverySnapshotForConfig(ctx context.Context, cfg *config.Config) (db.VeriflierDiscoverySnapshot, error) { + if cfg == nil || cfg.VeriflierDiscoveryModeOrDefault() == config.VeriflierDiscoveryModeStatic { + return 
db.VeriflierDiscoverySnapshot{}, nil + } + queryCtx, cancel := context.WithTimeout(ctx, dashboardHealthTimeout) + defer cancel() + return db.ListVeriflierDiscoverySnapshot(queryCtx, db.VeriflierDiscoveryDefaultStaleAfter) +} + +func renderVeriflierDiscoveryReadiness(mode string, snapshot db.VeriflierDiscoverySnapshot, err error, staticResults []veriflierReadinessResult) ([]string, bool) { + mode = (&config.Config{VeriflierDiscoveryMode: mode}).VeriflierDiscoveryModeOrDefault() + if mode == config.VeriflierDiscoveryModeStatic { + return []string{"INFO veriflier_discovery=static"}, false + } + + failed := false + if err != nil { + line := fmt.Sprintf("WARN veriflier_discovery mode=%s error=%q", mode, err.Error()) + if mode == config.VeriflierDiscoveryModeActive { + line = fmt.Sprintf("FAIL veriflier_discovery mode=%s error=%q", mode, err.Error()) + failed = true + } + return []string{line}, failed + } + + enabled, usable := 0, 0 + for _, vantage := range snapshot.Vantages { + if !vantage.Enabled { + continue + } + enabled++ + if vantage.Usable() { + usable++ + } + } + + lines := []string{fmt.Sprintf( + "INFO veriflier_discovery mode=%s enabled_vantages=%d usable_vantages=%d recent_agents=%d", + mode, enabled, usable, len(snapshot.Agents), + )} + for _, vantage := range snapshot.Vantages { + if !vantage.Enabled || vantage.Usable() { + continue + } + level := "WARN" + if mode == config.VeriflierDiscoveryModeActive { + level = "FAIL" + failed = true + } + lines = append(lines, fmt.Sprintf("%s veriflier_discovery_incomplete vantage_id=%q endpoint_host=%q endpoint_port=%q auth_token_present=%t", + level, vantage.VantageID, vantage.EndpointHost, vantage.EndpointPort, strings.TrimSpace(vantage.AuthToken) != "")) + } + if mode == config.VeriflierDiscoveryModeActive && usable == 0 { + lines = append(lines, "FAIL veriflier_discovery_active usable_vantages=0") + failed = true + } + if mode == config.VeriflierDiscoveryModeShadow { + lines = append(lines, 
veriflierDiscoveryDriftLines(snapshot, staticResults)...) + } + return lines, failed +} + +func veriflierDiscoveryDriftLines(snapshot db.VeriflierDiscoverySnapshot, staticResults []veriflierReadinessResult) []string { + staticVantages := make(map[string]struct{}) + for _, result := range staticResults { + if result.Err != nil || result.Status == nil || !statusSupportsProtocol(result.Status, veriflier.ProtocolV2) { + continue + } + id := strings.TrimSpace(result.Status.Vantage.ID) + if id != "" { + staticVantages[id] = struct{}{} + } + } + discovered := make(map[string]struct{}) + for _, vantage := range snapshot.Vantages { + if vantage.Enabled { + discovered[strings.TrimSpace(vantage.VantageID)] = struct{}{} + } + } + + var lines []string + for id := range discovered { + if id == "" { + continue + } + if _, ok := staticVantages[id]; !ok { + lines = append(lines, fmt.Sprintf("WARN veriflier_discovery_extra vantage_id=%q", id)) + } + } + for id := range staticVantages { + if _, ok := discovered[id]; !ok { + lines = append(lines, fmt.Sprintf("WARN veriflier_discovery_missing vantage_id=%q", id)) + } + } + sort.Strings(lines) + if len(lines) == 0 { + lines = append(lines, "PASS veriflier_discovery_shadow static_vantages_match_registry") + } + return lines +} + +func configuredVeriflierName(v config.VerifierConfig, index int) string { + if strings.TrimSpace(v.Name) != "" { + return v.Name + } + return fmt.Sprintf("veriflier-%d", index+1) +} + +func statusSupportsProtocol(status *veriflier.StatusV2Response, protocol string) bool { + if status == nil { + return false + } + for _, p := range status.Protocols { + if p == protocol { + return true + } + } + return false +} + +func duplicateVantageCounts(results []veriflierReadinessResult) map[string]int { + counts := make(map[string]int) + for _, result := range results { + if result.Err != nil || result.Status == nil || !statusSupportsProtocol(result.Status, veriflier.ProtocolV2) { + continue + } + vantageID := 
strings.TrimSpace(result.Status.Vantage.ID) + if vantageID == "" { + continue + } + counts[vantageID]++ + } + return counts +} + +func verifierCapacitySummary(c veriflier.Capacity) string { + return fmt.Sprintf("active=%d in_flight=%d max_concurrency=%d queue=%d/%d", + c.Active, c.InFlight, c.MaxConcurrency, c.QueueDepth, c.QueueCapacity) +} + +func enabledLabel(b bool) string { + if b { + return "enabled" + } + return "disabled" +} + +func checkDNSResolversLabel(servers []string) string { + if len(servers) == 0 { + return "system" + } + return "configured [" + strings.Join(servers, ",") + "]" +} + +func bucketOwnershipLabel(cfg *config.Config) string { + if min, max, ok := cfg.PinnedBucketRange(); ok { + return fmt.Sprintf("pinned range=%d-%d", min, max) + } + return "dynamic jetmon_hosts" +} + +func rolloutAdviceLines(cfg *config.Config) []string { + lines := []string{} + if _, _, ok := cfg.PinnedBucketRange(); ok { + lines = append(lines, "INFO rollout_static_plan="+staticPlanCheckCommand()) + } + lines = append(lines, + "INFO rollout_preflight="+rolloutPreflightCommand(cfg), + "INFO rollout_activity_check="+rolloutActivityCommand(), + ) + if cmd := cutoverCheckCommand(cfg); cmd != "" { + lines = append(lines, "INFO rollout_cutover_check="+cmd) + } + if cmd := rollbackCheckCommand(cfg); cmd != "" { + lines = append(lines, "INFO rollout_rollback_check="+cmd) + } + lines = append(lines, "INFO rollout_state_report="+stateReportCommand()) + lines = append(lines, "INFO rollout_drift_report="+projectionDriftCommand()) + return lines +} + +func staticPlanCheckCommand() string { + return "./jetmon2 rollout static-plan-check --file=" +} + +func rolloutPreflightCommand(cfg *config.Config) string { + if minBucket, maxBucket, ok := cfg.PinnedBucketRange(); ok { + cmd := fmt.Sprintf("./jetmon2 rollout host-preflight --file= --host= --runtime-host= --bucket-min=%d --bucket-max=%d", minBucket, maxBucket) + if cfg.BucketTotal > 0 { + cmd += fmt.Sprintf(" --bucket-total=%d", 
cfg.BucketTotal) + } + return cmd + } + return "./jetmon2 rollout dynamic-check" +} + +func rolloutActivityCommand() string { + return "./jetmon2 rollout activity-check --since=15m" +} + +func cutoverCheckCommand(cfg *config.Config) string { + if _, _, ok := cfg.PinnedBucketRange(); ok { + return "./jetmon2 rollout cutover-check --since=15m" + } + return "" +} + +func rollbackCheckCommand(cfg *config.Config) string { + if _, _, ok := cfg.PinnedBucketRange(); ok { + return "./jetmon2 rollout rollback-check" + } + return "" +} + +func projectionDriftCommand() string { + return "./jetmon2 rollout projection-drift" +} + +func stateReportCommand() string { + return "./jetmon2 rollout state-report --since=15m" +} + +func dashboardListenAddr(cfg *config.Config) string { + bindAddr := "127.0.0.1" + port := 0 + if cfg != nil { + if strings.TrimSpace(cfg.DashboardBindAddr) != "" { + bindAddr = strings.TrimSpace(cfg.DashboardBindAddr) + } + port = cfg.DashboardPort + } + return net.JoinHostPort(bindAddr, strconv.Itoa(port)) +} + +func dashboardBindWarning(bindAddr string) string { + bindAddr = strings.TrimSpace(bindAddr) + if bindAddr == "" { + bindAddr = "127.0.0.1" + } + host := strings.Trim(bindAddr, "[]") + host = strings.TrimSuffix(strings.ToLower(host), ".") + if host == "localhost" || strings.HasSuffix(host, ".localhost") { + return "" + } + if ip := net.ParseIP(host); ip != nil && ip.IsLoopback() { + return "" + } + return fmt.Sprintf("DASHBOARD_BIND_ADDR=%q exposes unauthenticated operator dashboards; restrict access to trusted operator networks", bindAddr) +} + +func newFleetDashboardStore(cfg *config.Config) *dashboard.FleetStore { + if cfg == nil { + cfg = config.Get() + } + bucketTotal := 0 + heartbeatGrace := 0 + if cfg != nil { + bucketTotal = cfg.BucketTotal + heartbeatGrace = cfg.BucketHeartbeatGraceSec + } + return dashboard.NewFleetStore(db.DB(), dashboard.FleetStoreOptions{ + BucketTotal: bucketTotal, + HeartbeatGrace: time.Duration(heartbeatGrace) * 
time.Second, + }) +} + +const dashboardHealthTimeout = 2 * time.Second + +func dashboardHealthEntries(ctx context.Context, cfg *config.Config, sqlDB *sql.DB, wp *wpcom.Client, statsdReady bool, checkedAt time.Time) []dashboard.HealthEntry { + entries := []dashboard.HealthEntry{ + mysqlHealthEntry(ctx, sqlDB, checkedAt), + wpcomHealthEntry(wp, checkedAt), + statsdHealthEntry(statsdReady, checkedAt), + diskHealthEntry("logs", checkedAt), + diskHealthEntry("stats", checkedAt), + } + entries = append(entries, veriflierHealthEntries(ctx, cfg, checkedAt)...) + if entry, ok := veriflierDiscoveryHealthEntry(ctx, cfg, checkedAt); ok { + entries = append(entries, entry) + } + return entries +} + +func monitorProcessHealthSnapshot(hostname string, startedAt time.Time, state string, cfg *config.Config, st dashboard.State, health []dashboard.HealthEntry) fleethealth.Snapshot { + if st.UpdatedAt.IsZero() { + st.UpdatedAt = time.Now().UTC() + } + bucketMin, bucketMax := st.BucketMin, st.BucketMax + apiPort, dashboardPort := cfg.APIPort, cfg.DashboardPort + healthStatus := dashboard.SummarizeHost(st, health).Status + if state == fleethealth.StateStopping || state == fleethealth.StateStopped { + healthStatus = fleethealth.HealthAmber + } + return fleethealth.Snapshot{ + HostID: hostname, + ProcessType: fleethealth.ProcessMonitor, + PID: os.Getpid(), + Version: version, + BuildDate: buildDate, + GoVersion: goVersion, + State: state, + HealthStatus: healthStatus, + StartedAt: startedAt, + UpdatedAt: time.Now().UTC(), + BucketMin: &bucketMin, + BucketMax: &bucketMax, + BucketOwnership: st.BucketOwnership, + APIPort: &apiPort, + DashboardPort: &dashboardPort, + DeliveryWorkersEnabled: st.DeliveryWorkersEnabled, + DeliveryOwnerHost: st.DeliveryOwnerHost, + WorkerCount: st.WorkerCount, + ActiveChecks: st.ActiveChecks, + QueueDepth: st.QueueDepth, + RetryQueueSize: st.RetryQueueSize, + WPCOMCircuitOpen: st.WPCOMCircuitOpen, + WPCOMQueueDepth: st.WPCOMQueueDepth, + GoSysMemMB: 
st.GoSysMemMB, + RSSMemMB: st.RSSMemMB, + DependencyHealth: dashboardHealthToFleet(health), + } +} + +func dashboardHealthToFleet(entries []dashboard.HealthEntry) []fleethealth.DependencyHealth { + out := make([]fleethealth.DependencyHealth, 0, len(entries)) + for _, entry := range entries { + out = append(out, fleethealth.DependencyHealth{ + Name: entry.Name, + Status: entry.Status, + LatencyMS: entry.Latency, + LastError: entry.LastError, + CheckedAt: entry.CheckedAt, + }) + } + return out +} + +func mysqlHealthEntry(ctx context.Context, sqlDB *sql.DB, checkedAt time.Time) dashboard.HealthEntry { + entry := dashboard.HealthEntry{Name: "mysql", CheckedAt: checkedAt} + if sqlDB == nil { + entry.Status = "red" + entry.LastError = "database pool is not initialized" + return entry + } + + pingCtx, cancel := context.WithTimeout(ctx, dashboardHealthTimeout) + defer cancel() + + start := time.Now() + if err := sqlDB.PingContext(pingCtx); err != nil { + entry.Status = "red" + entry.Latency = time.Since(start).Milliseconds() + entry.LastError = err.Error() + return entry + } + entry.Status = "green" + entry.Latency = time.Since(start).Milliseconds() + return entry +} + +func veriflierHealthEntries(ctx context.Context, cfg *config.Config, checkedAt time.Time) []dashboard.HealthEntry { + if cfg == nil || len(cfg.Verifiers) == 0 { + return []dashboard.HealthEntry{{ + Name: "verifliers", + Status: "amber", + LastError: "no verifliers configured", + CheckedAt: checkedAt, + }} + } + + results := probeConfiguredVerifliers(ctx, cfg, dashboardHealthTimeout) + vantageCounts := duplicateVantageCounts(results) + entries := make([]dashboard.HealthEntry, 0, len(results)) + for _, result := range results { + entry := dashboard.HealthEntry{ + Name: "veriflier:" + result.Name, + Latency: result.Latency.Milliseconds(), + CheckedAt: checkedAt, + } + if result.Err != nil { + entry.Status = "red" + entry.LastError = result.Err.Error() + entries = append(entries, entry) + continue + } + if 
result.Status == nil { + entry.Status = "red" + entry.LastError = "empty status response" + entries = append(entries, entry) + continue + } + if !statusSupportsProtocol(result.Status, veriflier.ProtocolV2) { + entry.Status = "amber" + entry.LastError = "legacy verifier status endpoint; v2 status metadata unavailable" + if result.Status.Version != "" { + entry.Name = fmt.Sprintf("%s (%s)", entry.Name, result.Status.Version) + } + entries = append(entries, entry) + continue + } + vantageID := strings.TrimSpace(result.Status.Vantage.ID) + if vantageID == "" { + entry.Status = "red" + entry.LastError = "v2 verifier status did not report a vantage id" + entries = append(entries, entry) + continue + } + if vantageCounts[vantageID] > 1 { + entry.Status = "red" + entry.LastError = fmt.Sprintf("duplicate v2 verifier vantage id %q", vantageID) + entries = append(entries, entry) + continue + } + entry.Status = "green" + entry.Name = fmt.Sprintf("%s (%s vantage=%s %s)", entry.Name, result.Status.Version, vantageID, verifierCapacitySummary(result.Status.Capacity)) + entries = append(entries, entry) + } + return entries +} + +func veriflierDiscoveryHealthEntry(ctx context.Context, cfg *config.Config, checkedAt time.Time) (dashboard.HealthEntry, bool) { + mode := cfg.VeriflierDiscoveryModeOrDefault() + if mode == config.VeriflierDiscoveryModeStatic { + return dashboard.HealthEntry{}, false + } + entry := dashboard.HealthEntry{Name: "veriflier-discovery", CheckedAt: checkedAt} + start := time.Now() + snapshot, err := veriflierDiscoverySnapshotForConfig(ctx, cfg) + entry.Latency = time.Since(start).Milliseconds() + if err != nil { + if mode == config.VeriflierDiscoveryModeActive { + entry.Status = "red" + } else { + entry.Status = "amber" + } + entry.LastError = err.Error() + return entry, true + } + enabled, usable := 0, 0 + for _, vantage := range snapshot.Vantages { + if !vantage.Enabled { + continue + } + enabled++ + if vantage.Usable() { + usable++ + } + } + entry.Name = 
fmt.Sprintf("veriflier-discovery:%s enabled=%d usable=%d agents=%d", mode, enabled, usable, len(snapshot.Agents)) + entry.Status = "green" + if mode == config.VeriflierDiscoveryModeActive && usable == 0 { + entry.Status = "red" + entry.LastError = "active discovery has no usable enabled vantages" + } else if mode == config.VeriflierDiscoveryModeShadow && enabled == 0 { + entry.Status = "amber" + entry.LastError = "shadow discovery registry has no enabled vantages" + } + return entry, true +} + +func wpcomHealthEntry(wp *wpcom.Client, checkedAt time.Time) dashboard.HealthEntry { + entry := dashboard.HealthEntry{Name: "wpcom", CheckedAt: checkedAt} + if wp == nil { + entry.Status = "red" + entry.LastError = "wpcom client is not initialized" + return entry + } + queueDepth := wp.QueueDepth() + if wp.IsCircuitOpen() { + entry.Status = "red" + entry.LastError = fmt.Sprintf("circuit open, queued notifications=%d", queueDepth) + return entry + } + if queueDepth > 0 { + entry.Status = "amber" + entry.LastError = fmt.Sprintf("queued notifications=%d", queueDepth) + return entry + } + entry.Status = "green" + return entry +} + +func statsdHealthEntry(ready bool, checkedAt time.Time) dashboard.HealthEntry { + entry := dashboard.HealthEntry{Name: "statsd", CheckedAt: checkedAt} + if !ready { + entry.Status = "amber" + entry.LastError = "statsd client is not initialized" + return entry + } + entry.Status = "green" + return entry +} + +func diskHealthEntry(dir string, checkedAt time.Time) dashboard.HealthEntry { + entry := dashboard.HealthEntry{Name: "disk:" + dir, CheckedAt: checkedAt} + if err := checkWritableDir(dir); err != nil { + entry.Status = "red" + entry.LastError = err.Error() + return entry + } + entry.Status = "green" + return entry +} + +func checkWritableDir(dir string) error { + info, err := os.Stat(dir) + if err != nil { + return err + } + if !info.IsDir() { + return fmt.Errorf("%s is not a directory", dir) + } + f, err := os.CreateTemp(dir, ".jetmon-health-*") 
+ if err != nil { + return err + } + name := f.Name() + if err := f.Close(); err != nil { + _ = os.Remove(name) + return err + } + if err := os.Remove(name); err != nil { + return err + } + return nil +} + +// emailTransportLabel collapses an empty EMAIL_TRANSPORT to its compatibility +// alias ("stub") so startup output and validate-config show a single canonical +// name regardless of which form an operator wrote in config. +func emailTransportLabel(cfg *config.Config) string { + if cfg.EmailTransport == "" { + return "stub" + } + return cfg.EmailTransport +} + +// emailTransportDelivers reports whether the configured email transport +// actually delivers mail. The stub transport (and the empty-string alias for +// it) only logs, so any alert-contact configured with transport="email" will +// silently disappear into the logs in that mode. +func emailTransportDelivers(cfg *config.Config) bool { + return cfg.EmailTransport == "smtp" || cfg.EmailTransport == "wpcom" +} + +func schedulerConfigLabel(cfg *config.Config) string { + if cfg.SchedulerEngine == "streaming" { + return fmt.Sprintf( + "streaming reload=%s legacy_projection=%s worker_floor=%d fetch_page_size=%d", + time.Duration(cfg.StreamingTargetReloadSec)*time.Second, + time.Duration(cfg.StreamingLegacyProjectionIntervalMin)*time.Minute, + cfg.NumWorkers, + cfg.DatasetSize, + ) + } + if cfg.UseVariableCheckIntervals { + return fmt.Sprintf( + "variable_intervals fetch_page_size=%d idle_poll=%s", + cfg.DatasetSize, + orchestrator.VariableIntervalPollInterval(), + ) + } + return fmt.Sprintf( + "fixed_rounds fetch_page_size=%d min_round_interval=%s", + cfg.DatasetSize, + time.Duration(cfg.MinTimeBetweenRoundsSec)*time.Second, + ) +} + +func deliveryWorkersShouldStart(cfg *config.Config, hostname string) bool { + if cfg.APIPort <= 0 { + return false + } + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + return owner == "" || owner == hostname +} + +func deliveryOwnerStatus(cfg *config.Config, hostname string) 
(string, string) { + owner := strings.TrimSpace(cfg.DeliveryOwnerHost) + if cfg.APIPort <= 0 { + if owner == "" { + return "INFO", "delivery_workers=disabled api_port=disabled" + } + return "INFO", fmt.Sprintf("delivery_owner_host=%q ignored because API_PORT is disabled", owner) + } + if owner == "" { + return "WARN", fmt.Sprintf("delivery_owner_host is unset; host %q will run delivery workers because API_PORT is enabled", hostname) + } + if owner == hostname { + return "INFO", fmt.Sprintf("delivery_owner_host=%q matched; delivery workers enabled on this host", owner) + } + return "INFO", fmt.Sprintf("delivery_owner_host=%q; delivery workers disabled on host %q", owner, hostname) +} + +func cmdStatus() { + // Connect to the running instance's internal API. + port := envOrDefault("DASHBOARD_PORT", "8080") + host := envOrDefault("DASHBOARD_HOST", envOrDefault("DASHBOARD_BIND_ADDR", "localhost")) + if host == "0.0.0.0" || host == "::" { + host = "localhost" + } + resp, err := httpGet(fmt.Sprintf("http://%s/api/state", net.JoinHostPort(host, port))) + if err != nil { + log.Fatalf("status: %v", err) + } + fmt.Println(resp) +} + +func cmdAudit() { + fs := flag.NewFlagSet("audit", flag.ExitOnError) + blogID := fs.Int64("blog-id", 0, "blog ID to query") + since := fs.String("since", "", "start time (RFC3339 or duration like 24h)") + until := fs.String("until", "", "end time (RFC3339)") + _ = fs.Parse(os.Args[2:]) + + if *blogID == 0 { + fmt.Fprintln(os.Stderr, "usage: jetmon2 audit --blog-id [--since