Monitoring & Observability
Overview
This document outlines the monitoring strategy, tools, and practices for maintaining visibility into our SaaS CRM application's health and performance.
Monitoring Stack
Tools Overview
| Tool | Purpose | What It Covers |
|---|---|---|
| Datadog | APM & Infrastructure | Application performance, traces |
| CloudWatch | AWS Resources | EC2, RDS, ALB metrics |
| Sentry | Error Tracking | Application errors, crashes |
| New Relic | Real User Monitoring | Frontend performance, user experience |
| Grafana | Visualization | Custom dashboards |
| Prometheus | Metrics Collection | Time-series data |
| ELK Stack | Log Management | Centralized logging |
| PagerDuty | Incident Management | Alert routing, on-call |
Application Performance Monitoring (APM)
Datadog Configuration
# datadog/config.yaml
api_key: ${DD_API_KEY}
site: datadoghq.com
hostname: ${HOSTNAME}
logs_enabled: true

process_config:
  enabled: true

apm_config:
  enabled: true
  env: production
  service_name: crm-app
  analyzed_spans:
    - service: mysql
      operation: query
    - service: redis
      operation: command
    - service: http
      operation: request
  filter_tags:
    reject: ["password", "token", "secret"]
  sampling_rules:
    - service: crm-api
      sample_rate: 0.1   # 10% sampling
    - service: crm-frontend
      sample_rate: 0.05  # 5% sampling
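With the agent installed, its DogStatsD listener (UDP port 8125 by default) accepts custom metrics without any vendor SDK. The snippet below is a minimal sketch that emits a gauge and a counter over a raw UDP socket; the metric names and tags are illustrative examples, not an agreed naming scheme.

# scripts/dogstatsd_example.py
# Minimal sketch: emit custom metrics to the local DogStatsD listener (UDP 8125).
# Metric names and tags below are illustrative, not part of our schema.
import socket

DOGSTATSD_ADDR = ("127.0.0.1", 8125)

def send_metric(payload: str) -> None:
    """Fire-and-forget a single DogStatsD datagram."""
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
        sock.sendto(payload.encode("utf-8"), DOGSTATSD_ADDR)

if __name__ == "__main__":
    # Gauge: current size of the import queue, tagged with env/service
    send_metric("crm.import_queue.size:42|g|#env:production,service:crm-app")
    # Counter: one processed webhook
    send_metric("crm.webhooks.processed:1|c|#env:production,service:crm-app")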
Frontend Monitoring
// plugins/monitoring.client.js
import { datadogRum } from '@datadog/browser-rum'
import * as Sentry from '@sentry/vue'
export default defineNuxtPlugin((nuxtApp) => {
// Datadog RUM
datadogRum.init({
applicationId: process.env.DD_APPLICATION_ID,
clientToken: process.env.DD_CLIENT_TOKEN,
site: 'datadoghq.com',
service: 'crm-frontend',
env: process.env.NODE_ENV,
version: process.env.APP_VERSION,
sessionSampleRate: 100,
trackInteractions: true,
defaultPrivacyLevel: 'mask-user-input'
})
// Sentry Error Tracking
Sentry.init({
app: nuxtApp.vueApp,
dsn: process.env.SENTRY_DSN,
environment: process.env.NODE_ENV,
tracesSampleRate: 0.1,
integrations: [
new Sentry.BrowserTracing(),
new Sentry.Replay({
maskAllText: true,
blockAllMedia: true
})
],
beforeSend(event, hint) {
// Filter sensitive data
if (event.request) {
delete event.request.cookies
delete event.request.headers
}
return event
}
})
// Custom performance monitoring
if (typeof window !== 'undefined') {
// Track page load performance
window.addEventListener('load', () => {
const perfData = performance.getEntriesByType('navigation')[0]
datadogRum.addTiming('page_load', perfData.loadEventEnd)
// Track Core Web Vitals
if ('PerformanceObserver' in window) {
// Largest Contentful Paint
new PerformanceObserver((entryList) => {
const entries = entryList.getEntries()
const lastEntry = entries[entries.length - 1]
datadogRum.addTiming('lcp', lastEntry.renderTime || lastEntry.loadTime)
}).observe({ type: 'largest-contentful-paint', buffered: true })
// First Input Delay
new PerformanceObserver((entryList) => {
const firstInput = entryList.getEntries()[0]
datadogRum.addTiming('fid', firstInput.processingStart - firstInput.startTime)
}).observe({ type: 'first-input', buffered: true })
// Cumulative Layout Shift
let clsValue = 0
new PerformanceObserver((entryList) => {
for (const entry of entryList.getEntries()) {
if (!entry.hadRecentInput) {
clsValue += entry.value
}
}
datadogRum.addTiming('cls', clsValue)
}).observe({ type: 'layout-shift', buffered: true })
}
})
}
})
Backend Monitoring
<?php
// app/middleware/MonitoringMiddleware.php
namespace App\Middleware;
use DDTrace\GlobalTracer;
use Prometheus\CollectorRegistry;
use Prometheus\Storage\Redis;
class MonitoringMiddleware
{
private $tracer;
private $metrics;
public function __construct()
{
$this->tracer = GlobalTracer::get();
$this->metrics = new CollectorRegistry(new Redis());
}
public function beforeExecuteRoute($dispatcher)
{
// Start APM trace (startActiveSpan() returns a scope; tag the underlying span)
$scope = $this->tracer->startActiveSpan('http.request');
$span = $scope->getSpan();
$span->setTag('http.method', $_SERVER['REQUEST_METHOD']);
$span->setTag('http.url', $_SERVER['REQUEST_URI']);
$span->setTag('http.client_ip', $_SERVER['REMOTE_ADDR']);
// Record metrics
$requestCounter = $this->metrics->getOrRegisterCounter(
'crm',
'http_requests_total',
'Total HTTP requests',
['method', 'endpoint']
);
$requestCounter->inc([
$_SERVER['REQUEST_METHOD'],
$dispatcher->getControllerName() . '/' . $dispatcher->getActionName()
]);
// Start timing
$startTime = microtime(true);
$dispatcher->setParam('_monitoring_start', $startTime);
}
public function afterExecuteRoute($dispatcher)
{
$span = $this->tracer->getActiveSpan();
$startTime = $dispatcher->getParam('_monitoring_start');
$duration = microtime(true) - $startTime;
// Record response metrics
$response = $dispatcher->getReturnedValue();
$statusCode = $response->getStatusCode();
$span->setTag('http.status_code', $statusCode);
$span->setTag('http.duration', $duration);
// Record latency histogram
$latencyHistogram = $this->metrics->getOrRegisterHistogram(
'crm',
'http_request_duration_seconds',
'HTTP request latency',
['method', 'endpoint', 'status']
);
$latencyHistogram->observe($duration, [
$_SERVER['REQUEST_METHOD'],
$dispatcher->getControllerName() . '/' . $dispatcher->getActionName(),
$statusCode
]);
// Track errors
if ($statusCode >= 400) {
$errorCounter = $this->metrics->getOrRegisterCounter(
'crm',
'http_errors_total',
'Total HTTP errors',
['method', 'endpoint', 'status']
);
$errorCounter->inc([
$_SERVER['REQUEST_METHOD'],
$dispatcher->getControllerName() . '/' . $dispatcher->getActionName(),
$statusCode
]);
// Send to Sentry if 5xx error
if ($statusCode >= 500) {
\Sentry\captureMessage(
"HTTP {$statusCode} Error",
\Sentry\Severity::error()
);
}
}
$span->finish();
}
}
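The counters and histograms the middleware records are only useful once something reads them back. As a quick sanity check, the sketch below queries the Prometheus HTTP API for the 5-minute error rate; it assumes the counters above are scraped into Prometheus under the `crm_` prefix implied by the registry namespace, and the Prometheus URL is a placeholder.

# scripts/check_error_rate.py
# Sketch: read back the http_errors_total counter recorded by MonitoringMiddleware
# via the Prometheus HTTP API. PROMETHEUS_URL is an assumed internal address.
import requests

PROMETHEUS_URL = "http://prometheus:9090"  # assumption: internal service name

def error_rate_per_second() -> float:
    """Return the current 5m error rate summed across all endpoints."""
    resp = requests.get(
        f"{PROMETHEUS_URL}/api/v1/query",
        params={"query": "sum(rate(crm_http_errors_total[5m]))"},
        timeout=5,
    )
    resp.raise_for_status()
    result = resp.json()["data"]["result"]
    return float(result[0]["value"][1]) if result else 0.0

if __name__ == "__main__":
    print(f"Current error rate: {error_rate_per_second():.4f} req/s")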
Infrastructure Monitoring
CloudWatch Metrics
# scripts/custom_metrics.py
import boto3
import psutil
import time
from datetime import datetime

cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')

def send_custom_metrics():
    """Send custom metrics to CloudWatch"""
    # System metrics
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()
    disk = psutil.disk_usage('/')

    # Network metrics
    net_io = psutil.net_io_counters()

    # Application metrics
    active_connections = get_active_connections()
    queue_size = get_queue_size()
    cache_hit_rate = get_cache_hit_rate()

    metrics = [
        {
            'MetricName': 'CPUUtilization',
            'Value': cpu_percent,
            'Unit': 'Percent',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'MemoryUtilization',
            'Value': memory.percent,
            'Unit': 'Percent',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'DiskUtilization',
            'Value': disk.percent,
            'Unit': 'Percent',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'NetworkBytesIn',
            'Value': net_io.bytes_recv,
            'Unit': 'Bytes',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'NetworkBytesOut',
            'Value': net_io.bytes_sent,
            'Unit': 'Bytes',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'ActiveConnections',
            'Value': active_connections,
            'Unit': 'Count',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'QueueSize',
            'Value': queue_size,
            'Unit': 'Count',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'CacheHitRate',
            'Value': cache_hit_rate,
            'Unit': 'Percent',
            'Timestamp': datetime.utcnow()
        }
    ]

    # Send metrics in batches
    for i in range(0, len(metrics), 20):
        batch = metrics[i:i+20]
        cloudwatch.put_metric_data(
            Namespace='CRM/Application',
            MetricData=batch
        )

    print(f"Sent {len(metrics)} metrics to CloudWatch")

def get_active_connections():
    """Get number of active database connections"""
    # Implementation depends on your database
    return 42

def get_queue_size():
    """Get message queue size"""
    # Implementation depends on your queue system
    return 156

def get_cache_hit_rate():
    """Calculate cache hit rate"""
    # Implementation depends on your cache system
    return 92.5

if __name__ == "__main__":
    while True:
        send_custom_metrics()
        time.sleep(60)  # Send metrics every minute
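Publishing metrics is only half of the loop; an alarm definition turns them into notifications. The sketch below creates a CloudWatch alarm on the custom MemoryUtilization metric with boto3; the threshold and SNS topic ARN are placeholders to adapt.

# scripts/create_memory_alarm.py
# Sketch: alarm on the custom MemoryUtilization metric published above.
# The SNS topic ARN and threshold are placeholder assumptions.
import boto3

cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')

cloudwatch.put_metric_alarm(
    AlarmName='crm-high-memory-utilization',
    Namespace='CRM/Application',
    MetricName='MemoryUtilization',
    Statistic='Average',
    Period=300,                      # evaluate 5-minute averages
    EvaluationPeriods=2,             # two consecutive breaches before alarming
    Threshold=85.0,
    ComparisonOperator='GreaterThanThreshold',
    TreatMissingData='breaching',    # missing data usually means the agent died
    AlarmActions=['arn:aws:sns:us-east-1:123456789012:crm-ops-alerts'],  # placeholder ARN
    AlarmDescription='Memory utilization above 85% for 10 minutes',
)
print("Alarm crm-high-memory-utilization created/updated")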
Server Monitoring
# ansible/monitoring.yml
---
- name: Configure monitoring agents
  hosts: all
  become: yes
  tasks:
    - name: Install Node Exporter
      get_url:
        url: https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz
        dest: /tmp/node_exporter.tar.gz

    - name: Extract Node Exporter
      unarchive:
        src: /tmp/node_exporter.tar.gz
        dest: /usr/local/bin
        remote_src: yes
        extra_opts: ["--strip-components=1"]  # the tarball nests the binary in a versioned directory

    - name: Create Node Exporter service
      copy:
        content: |
          [Unit]
          Description=Node Exporter
          After=network.target

          [Service]
          Type=simple
          ExecStart=/usr/local/bin/node_exporter
          Restart=always

          [Install]
          WantedBy=multi-user.target
        dest: /etc/systemd/system/node_exporter.service

    - name: Start Node Exporter
      systemd:
        name: node_exporter
        state: started
        enabled: yes
        daemon_reload: yes

    - name: Install Datadog Agent
      shell: |
        DD_API_KEY="{{ datadog_api_key }}" \
        DD_SITE="datadoghq.com" \
        bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)"

    - name: Configure Datadog
      template:
        src: datadog.yaml.j2
        dest: /etc/datadog-agent/datadog.yaml
      notify: restart datadog-agent
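After the playbook runs, it is worth confirming that Node Exporter actually serves metrics before wiring the host into Prometheus. A minimal probe, assuming the default port 9100:

# scripts/verify_node_exporter.py
# Minimal sketch: confirm Node Exporter answers on its default port (9100)
# and exposes at least the standard node_* metric families.
import sys
import requests

def check(host: str = "localhost", port: int = 9100) -> bool:
    try:
        resp = requests.get(f"http://{host}:{port}/metrics", timeout=3)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"node_exporter unreachable: {exc}")
        return False
    # A healthy exporter on Linux always exposes node_cpu_seconds_total
    return "node_cpu_seconds_total" in resp.text

if __name__ == "__main__":
    sys.exit(0 if check() else 1)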
Database Monitoring
MySQL Monitoring Queries
-- Performance Schema Queries
-- Top slow queries
SELECT
digest_text,
count_star,
avg_timer_wait/1000000000 AS avg_time_ms,
sum_timer_wait/1000000000 AS total_time_ms
FROM performance_schema.events_statements_summary_by_digest
ORDER BY sum_timer_wait DESC
LIMIT 10;
-- Connection statistics
SELECT
variable_value AS current_connections
FROM performance_schema.global_status
WHERE variable_name = 'Threads_connected';
-- Table statistics
SELECT
    object_schema,
    object_name,
    count_read,
    count_insert,
    count_update,
    count_delete
FROM performance_schema.table_io_waits_summary_by_table
WHERE object_schema = 'crm'
ORDER BY count_read DESC;
-- Lock waits
SELECT
waiting_trx_id,
waiting_pid,
waiting_query,
blocking_trx_id,
blocking_pid,
blocking_query
FROM sys.innodb_lock_waits;
-- Buffer pool statistics
SELECT
    page_type,
    pool_id,
    COUNT(*) AS page_count
FROM information_schema.innodb_buffer_page
GROUP BY page_type, pool_id
ORDER BY page_count DESC;
Database Monitoring Script
#!/bin/bash
# scripts/monitor_mysql.sh
MYSQL_HOST="localhost"
MYSQL_USER="monitoring"
MYSQL_PASS="$MYSQL_MONITORING_PASSWORD"
# Function to execute query and send to monitoring
function monitor_metric() {
local metric_name=$1
local query=$2
local value=$(mysql -h $MYSQL_HOST -u $MYSQL_USER -p$MYSQL_PASS -sN -e "$query")
# Send to Datadog
echo "mysql.$metric_name:$value|g" | nc -u -w0 127.0.0.1 8125
}
# Monitor connections
monitor_metric "connections.current" \
"SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Threads_connected'"
# Monitor slow queries
monitor_metric "slow_queries.count" \
"SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Slow_queries'"
# Monitor query cache hit rate (MySQL 5.7 only; the query cache was removed in MySQL 8.0)
monitor_metric "query_cache.hit_rate" \
"SELECT (Qcache_hits / (Qcache_hits + Qcache_inserts + Qcache_not_cached)) * 100
FROM (SELECT
(SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Qcache_hits') as Qcache_hits,
(SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Qcache_inserts') as Qcache_inserts,
(SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Qcache_not_cached') as Qcache_not_cached
) as cache_stats"
# Monitor replication lag (if replica). SHOW SLAVE STATUS cannot be wrapped in a SELECT,
# so parse its output directly instead of going through monitor_metric
replication_lag=$(mysql -h $MYSQL_HOST -u $MYSQL_USER -p$MYSQL_PASS -e "SHOW SLAVE STATUS\G" | awk '/Seconds_Behind_Master:/ {print $2}')
if [ -n "$replication_lag" ] && [ "$replication_lag" != "NULL" ]; then
echo "mysql.replication.lag_seconds:$replication_lag|g" | nc -u -w0 127.0.0.1 8125
fi
# Monitor table sizes
mysql -h $MYSQL_HOST -u $MYSQL_USER -p$MYSQL_PASS -sN -e "
SELECT
CONCAT('table.size.', table_name, ':', ROUND(data_length / 1024 / 1024, 2))
FROM information_schema.tables
WHERE table_schema = 'crm'
" | while read metric; do
echo "$metric|g" | nc -u -w0 127.0.0.1 8125
done
Log Management
ELK Stack Configuration
# elasticsearch/elasticsearch.yml
cluster.name: crm-logs
node.name: es-node-1
network.host: 0.0.0.0
discovery.seed_hosts: ["es-node-1", "es-node-2", "es-node-3"]
cluster.initial_master_nodes: ["es-node-1"]
# Index lifecycle management
xpack.ilm.enabled: true
# Security
xpack.security.enabled: true
xpack.security.transport.ssl.enabled: true
# logstash/pipeline/crm.conf
input {
beats {
port => 5044
}
tcp {
port => 5000
codec => json
}
}
filter {
# Parse application logs
if [type] == "application" {
grok {
match => {
"message" => "%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"
}
}
date {
match => ["timestamp", "ISO8601"]
}
}
# Parse access logs
if [type] == "access" {
grok {
match => {
"message" => '%{IPORHOST:client_ip} - - \[%{HTTPDATE:timestamp}\] "%{WORD:method} %{URIPATHPARAM:request} HTTP/%{NUMBER:http_version}" %{NUMBER:status_code} %{NUMBER:bytes} "%{URI:referrer}" "%{GREEDYDATA:user_agent}"'
}
}
geoip {
source => "client_ip"
}
useragent {
source => "user_agent"
}
}
# Add metadata
mutate {
add_field => {
"environment" => "${ENVIRONMENT:production}"
"application" => "crm"
}
}
# Remove sensitive data
mutate {
remove_field => ["password", "token", "secret", "credit_card"]
}
}
output {
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "crm-%{type}-%{+YYYY.MM.dd}"
template_name => "crm"
template => "/etc/logstash/templates/crm.json"
}
# Send critical errors to Slack
if [level] == "ERROR" or [level] == "CRITICAL" {
http {
url => "${SLACK_WEBHOOK_URL}"
http_method => "post"
format => "json"
mapping => {
"text" => "🚨 Error in %{application}: %{message}"
"channel" => "#alerts"
}
}
}
}
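The tcp input on port 5000 accepts JSON-encoded events directly, which is handy for ad-hoc scripts and batch jobs that do not run Filebeat. A minimal sketch; the Logstash hostname and the event fields are assumptions:

# scripts/ship_log_event.py
# Sketch: push a structured log event into the Logstash tcp/json input on port 5000.
# LOGSTASH_HOST is an assumed internal hostname.
import json
import socket
from datetime import datetime, timezone

LOGSTASH_HOST = ("logstash", 5000)

def ship(level: str, message: str, **fields) -> None:
    event = {
        "@timestamp": datetime.now(timezone.utc).isoformat(),
        "type": "batch-job",   # its own type, so the "application" grok filter is skipped
        "level": level,
        "message": message,
        **fields,
    }
    with socket.create_connection(LOGSTASH_HOST, timeout=5) as sock:
        # One JSON document, then close the connection, keeps the plain json codec happy
        sock.sendall((json.dumps(event) + "\n").encode("utf-8"))

if __name__ == "__main__":
    ship("INFO", "nightly import finished", records=1200, job="contacts-import")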
Application Logging
<?php
// app/config/logging.php
use Monolog\Logger;
use Monolog\Handler\StreamHandler;
use Monolog\Handler\ElasticsearchHandler;
use Monolog\Formatter\JsonFormatter;
use Elasticsearch\ClientBuilder;
return [
'default' => 'stack',
'channels' => [
'stack' => [
'driver' => 'stack',
'channels' => ['daily', 'elasticsearch', 'sentry'],
],
'daily' => [
'driver' => 'daily',
'path' => storage_path('logs/crm.log'),
'level' => 'debug',
'days' => 14,
'formatter' => JsonFormatter::class,
],
'elasticsearch' => [
'driver' => 'custom',
'via' => function () {
$client = ClientBuilder::create()
->setHosts([env('ELASTICSEARCH_HOST')])
->build();
$handler = new ElasticsearchHandler($client, [
'index' => 'crm-logs',
'type' => '_doc',
]);
$handler->setFormatter(new JsonFormatter());
return new Logger('elasticsearch', [$handler]);
},
],
'sentry' => [
'driver' => 'sentry',
'level' => 'error',
'bubble' => true,
],
'performance' => [
'driver' => 'custom',
'via' => function () {
$handler = new StreamHandler(
storage_path('logs/performance.log'),
Logger::INFO
);
$handler->setFormatter(new JsonFormatter());
return new Logger('performance', [$handler]);
},
],
],
];
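Once logs land in Elasticsearch, recent errors can be pulled back out from a cron job or a chat-ops command. A hedged sketch against the index naming used above (crm-* indices with a level field); the Elasticsearch URL and field mappings may need adjusting:

# scripts/recent_errors.py
# Sketch: list the last ten ERROR entries from the crm-* indices.
# The Elasticsearch URL and field names mirror the config above but may need adjusting.
import requests

ES_URL = "http://elasticsearch:9200"  # assumption: internal service name

query = {
    "size": 10,
    "sort": [{"@timestamp": {"order": "desc"}}],
    "query": {
        "bool": {
            "must": [{"match": {"level": "ERROR"}}],
            "filter": [{"range": {"@timestamp": {"gte": "now-1h"}}}],
        }
    },
}

resp = requests.get(f"{ES_URL}/crm-*/_search", json=query, timeout=10)
resp.raise_for_status()
for hit in resp.json()["hits"]["hits"]:
    src = hit["_source"]
    print(src.get("@timestamp"), src.get("message"))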
Alerting Strategy
Alert Configuration
# alerts/rules.yml
groups:
  - name: application
    interval: 30s
    rules:
      - alert: HighErrorRate
        expr: rate(http_errors_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors per second"

      - alert: HighLatency
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High latency detected"
          description: "95th percentile latency is {{ $value }} seconds"

      - alert: LowCacheHitRate
        expr: cache_hit_rate < 80
        for: 10m
        labels:
          severity: info
          team: backend
        annotations:
          summary: "Low cache hit rate"
          description: "Cache hit rate is {{ $value }}%"

  - name: infrastructure
    interval: 60s
    rules:
      - alert: HighCPUUsage
        expr: cpu_usage_percent > 80
        for: 5m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      - alert: LowDiskSpace
        expr: disk_free_percent < 20
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Only {{ $value }}% disk space remaining"

      - alert: DatabaseConnectionPoolExhausted
        expr: database_connections_active / database_connections_max > 0.9
        for: 5m
        labels:
          severity: critical
          team: database
        annotations:
          summary: "Database connection pool near exhaustion"
          description: "{{ $value | humanizePercentage }} of connections in use"
PagerDuty Integration
# scripts/pagerduty_integration.py
import requests
import json
from datetime import datetime

class PagerDutyAlert:
    def __init__(self, api_key, service_id):
        self.api_key = api_key
        self.service_id = service_id
        self.base_url = "https://api.pagerduty.com"

    def trigger_incident(self, summary, severity="error", details=None):
        """Trigger a PagerDuty incident"""
        payload = {
            "incident": {
                "type": "incident",
                "title": summary,
                "service": {
                    "id": self.service_id,
                    "type": "service_reference"
                },
                "urgency": self._get_urgency(severity),
                "body": {
                    "type": "incident_body",
                    "details": details or summary
                }
            }
        }
        headers = {
            "Authorization": f"Token token={self.api_key}",
            "Content-Type": "application/json",
            "From": "monitoring@ourcrm.com"
        }
        response = requests.post(
            f"{self.base_url}/incidents",
            headers=headers,
            json=payload
        )
        if response.status_code == 201:
            incident = response.json()["incident"]
            return incident["id"]
        else:
            raise Exception(f"Failed to create incident: {response.text}")

    def _get_urgency(self, severity):
        """Map severity to PagerDuty urgency"""
        mapping = {
            "critical": "high",
            "error": "high",
            "warning": "low",
            "info": "low"
        }
        return mapping.get(severity, "low")
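Callers only need the REST API key and the target service ID, both of which should come from the secret store rather than code. A usage example with placeholder values:

# Example usage (placeholder credentials - load these from the secret store)
import os

alert = PagerDutyAlert(
    api_key=os.environ["PAGERDUTY_API_KEY"],
    service_id=os.environ["PAGERDUTY_SERVICE_ID"],
)
incident_id = alert.trigger_incident(
    summary="Checkout API error rate above 5%",
    severity="critical",
    details="HighErrorRate alert firing for crm-api since 14:02 UTC",
)
print(f"Opened PagerDuty incident {incident_id}")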
Dashboard Creation
Grafana Dashboard
{
"dashboard": {
"title": "CRM Application Dashboard",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{endpoint}}"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_errors_total[5m])",
"legendFormat": "{{status}}"
}
]
},
{
"title": "Response Time (p95)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, http_request_duration_seconds)",
"legendFormat": "95th percentile"
}
]
},
{
"title": "Active Users",
"type": "stat",
"targets": [
{
"expr": "active_users_total"
}
]
},
{
"title": "Database Connections",
"type": "gauge",
"targets": [
{
"expr": "mysql_connections_active / mysql_connections_max * 100"
}
]
},
{
"title": "Cache Hit Rate",
"type": "gauge",
"targets": [
{
"expr": "redis_hits / (redis_hits + redis_misses) * 100"
}
]
}
]
}
}
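Dashboards should live in version control and be pushed through Grafana's HTTP API rather than hand-edited in the UI. A sketch of the upload step; the Grafana URL, API key variable, and file path are assumptions:

# scripts/push_dashboard.py
# Sketch: push the dashboard JSON above to Grafana via its HTTP API.
# GRAFANA_URL, the API key env var, and the file path are assumptions.
import json
import os
import requests

GRAFANA_URL = "http://grafana:3000"
API_KEY = os.environ["GRAFANA_API_KEY"]

with open("dashboards/crm-application.json") as fh:
    dashboard = json.load(fh)["dashboard"]

payload = {
    "dashboard": dashboard,
    "overwrite": True,  # replace the existing version if one exists
    "message": "Updated via push_dashboard.py",
}

resp = requests.post(
    f"{GRAFANA_URL}/api/dashboards/db",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json=payload,
    timeout=10,
)
resp.raise_for_status()
print("Dashboard pushed:", resp.json().get("url"))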
Health Checks
Application Health Endpoint
<?php
// app/controllers/HealthController.php
class HealthController extends Controller
{
public function check()
{
$health = [
'status' => 'healthy',
'timestamp' => time(),
'version' => getenv('APP_VERSION'),
'checks' => []
];
// Database check
try {
$db = $this->db;
$db->query("SELECT 1");
$health['checks']['database'] = [
'status' => 'healthy',
'response_time' => $this->measureTime(function() use ($db) {
$db->query("SELECT 1");
})
];
} catch (\Exception $e) {
$health['status'] = 'unhealthy';
$health['checks']['database'] = [
'status' => 'unhealthy',
'error' => $e->getMessage()
];
}
// Redis check
try {
$redis = $this->redis;
$redis->ping();
$health['checks']['redis'] = [
'status' => 'healthy',
'response_time' => $this->measureTime(function() use ($redis) {
$redis->ping();
})
];
} catch (\Exception $e) {
$health['status'] = 'degraded';
$health['checks']['redis'] = [
'status' => 'unhealthy',
'error' => $e->getMessage()
];
}
// Queue check
try {
$queue = $this->queue;
$queueSize = $queue->size();
$health['checks']['queue'] = [
'status' => $queueSize < 1000 ? 'healthy' : 'degraded',
'size' => $queueSize
];
} catch (\Exception $e) {
$health['checks']['queue'] = [
'status' => 'unhealthy',
'error' => $e->getMessage()
];
}
// Disk space check
$diskFree = disk_free_space('/');
$diskTotal = disk_total_space('/');
$diskPercent = ($diskFree / $diskTotal) * 100;
$health['checks']['disk'] = [
'status' => $diskPercent > 20 ? 'healthy' : 'degraded',
'free_percent' => round($diskPercent, 2)
];
// Memory check
$memoryUsage = memory_get_usage(true);
$memoryLimit = ini_get('memory_limit');
$health['checks']['memory'] = [
'status' => 'healthy',
'usage_mb' => round($memoryUsage / 1024 / 1024, 2),
'limit' => $memoryLimit
];
// Set appropriate HTTP status code
$statusCode = 200;
if ($health['status'] === 'degraded') {
$statusCode = 200; // Still operational
} elseif ($health['status'] === 'unhealthy') {
$statusCode = 503; // Service unavailable
}
return $this->response
->setStatusCode($statusCode)
->setJsonContent($health);
}
private function measureTime(callable $operation)
{
$start = microtime(true);
$operation();
return round((microtime(true) - $start) * 1000, 2); // in ms
}
}
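The endpoint above is what the load balancer polls, but it is also useful to probe it externally (for example from a cron job or synthetic check) and alert when it degrades. A minimal sketch; the URL is a placeholder:

# scripts/probe_health.py
# Sketch: external probe for the /health endpoint defined above.
# The URL is a placeholder; the exit code is non-zero on failure so cron/CI can alert on it.
import sys
import requests

HEALTH_URL = "https://app.example-crm.com/health"  # placeholder URL

def probe() -> int:
    try:
        resp = requests.get(HEALTH_URL, timeout=10)
    except requests.RequestException as exc:
        print(f"health endpoint unreachable: {exc}")
        return 2

    try:
        body = resp.json()
    except ValueError:
        body = {}

    status = body.get("status", "unknown")
    print(f"HTTP {resp.status_code}, status={status}")
    for name, check in body.get("checks", {}).items():
        print(f"  {name}: {check.get('status')}")

    if resp.status_code != 200 or status == "unhealthy":
        return 2   # hard failure - page immediately
    if status == "degraded":
        return 1   # soft failure - warn only
    return 0

if __name__ == "__main__":
    sys.exit(probe())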
Last Updated: January 2025
Version: 1.0.0