Monitoring & Observability

Overview

This document outlines the monitoring strategy, tools, and practices for maintaining visibility into our SaaS CRM application's health and performance.

Monitoring Stack

Tools Overview

| Tool | Purpose | Metrics |
| --- | --- | --- |
| Datadog | APM & Infrastructure | Application performance, traces |
| CloudWatch | AWS Resources | EC2, RDS, ALB metrics |
| Sentry | Error Tracking | Application errors, crashes |
| New Relic | Real User Monitoring | Frontend performance, user experience |
| Grafana | Visualization | Custom dashboards |
| Prometheus | Metrics Collection | Time-series data |
| ELK Stack | Log Management | Centralized logging |
| PagerDuty | Incident Management | Alert routing, on-call |

Application Performance Monitoring (APM)

Datadog Configuration

# datadog/config.yaml
api_key: ${DD_API_KEY}
site: datadoghq.com
hostname: ${HOSTNAME}

logs_enabled: true
process_config:
  enabled: true

apm_config:
  enabled: true
  env: production
  service_name: crm-app

  analyzed_spans:
    - service: mysql
      operation: query
    - service: redis
      operation: command
    - service: http
      operation: request

  filter_tags:
    reject: ["password", "token", "secret"]

  sampling_rules:
    - service: crm-api
      sample_rate: 0.1  # 10% sampling
    - service: crm-frontend
      sample_rate: 0.05  # 5% sampling

Frontend Monitoring

// plugins/monitoring.client.js
import { datadogRum } from '@datadog/browser-rum'
import * as Sentry from '@sentry/vue'

export default defineNuxtPlugin((nuxtApp) => {
  // Datadog RUM
  datadogRum.init({
    applicationId: process.env.DD_APPLICATION_ID,
    clientToken: process.env.DD_CLIENT_TOKEN,
    site: 'datadoghq.com',
    service: 'crm-frontend',
    env: process.env.NODE_ENV,
    version: process.env.APP_VERSION,
    sessionSampleRate: 100,
    trackInteractions: true,
    defaultPrivacyLevel: 'mask-user-input'
  })

  // Sentry Error Tracking
  Sentry.init({
    app: nuxtApp.vueApp,
    dsn: process.env.SENTRY_DSN,
    environment: process.env.NODE_ENV,
    tracesSampleRate: 0.1,
    // Session Replay sampling: record 10% of sessions and all sessions with errors
    replaysSessionSampleRate: 0.1,
    replaysOnErrorSampleRate: 1.0,
    integrations: [
      new Sentry.BrowserTracing(),
      new Sentry.Replay({
        maskAllText: true,
        blockAllMedia: true
      })
    ],
    beforeSend(event, hint) {
      // Filter sensitive data
      if (event.request) {
        delete event.request.cookies
        delete event.request.headers
      }
      return event
    }
  })

  // Custom performance monitoring
  if (typeof window !== 'undefined') {
    // Track page load performance
    window.addEventListener('load', () => {
      const perfData = performance.getEntriesByType('navigation')[0]
      datadogRum.addTiming('page_load', perfData.loadEventEnd)

      // Track Core Web Vitals
      if ('PerformanceObserver' in window) {
        // Largest Contentful Paint
        new PerformanceObserver((entryList) => {
          const entries = entryList.getEntries()
          const lastEntry = entries[entries.length - 1]
          datadogRum.addTiming('lcp', lastEntry.renderTime || lastEntry.loadTime)
        }).observe({ type: 'largest-contentful-paint', buffered: true })

        // First Input Delay
        new PerformanceObserver((entryList) => {
          const firstInput = entryList.getEntries()[0]
          datadogRum.addTiming('fid', firstInput.processingStart - firstInput.startTime)
        }).observe({ type: 'first-input', buffered: true })

        // Cumulative Layout Shift
        let clsValue = 0
        new PerformanceObserver((entryList) => {
          for (const entry of entryList.getEntries()) {
            if (!entry.hadRecentInput) {
              clsValue += entry.value
            }
          }
          datadogRum.addTiming('cls', clsValue)
        }).observe({ type: 'layout-shift', buffered: true })
      }
    })
  }
})

Backend Monitoring

<?php
// app/middleware/MonitoringMiddleware.php

namespace App\Middleware;

use DDTrace\GlobalTracer;
use Prometheus\CollectorRegistry;
use Prometheus\Storage\Redis;

class MonitoringMiddleware
{
    private $tracer;
    private $metrics;

    public function __construct()
    {
        $this->tracer = GlobalTracer::get();
        $this->metrics = new CollectorRegistry(new Redis());
    }

    public function beforeExecuteRoute($dispatcher)
    {
        // Start APM trace (startActiveSpan() returns a scope wrapping the span)
        $scope = $this->tracer->startActiveSpan('http.request');
        $span = $scope->getSpan();
        $span->setTag('http.method', $_SERVER['REQUEST_METHOD']);
        $span->setTag('http.url', $_SERVER['REQUEST_URI']);
        $span->setTag('http.client_ip', $_SERVER['REMOTE_ADDR']);

        // Record metrics
        $requestCounter = $this->metrics->getOrRegisterCounter(
            'crm',
            'http_requests_total',
            'Total HTTP requests',
            ['method', 'endpoint']
        );

        $requestCounter->inc([
            $_SERVER['REQUEST_METHOD'],
            $dispatcher->getControllerName() . '/' . $dispatcher->getActionName()
        ]);

        // Start timing
        $startTime = microtime(true);
        $dispatcher->setParam('_monitoring_start', $startTime);
    }

    public function afterExecuteRoute($dispatcher)
    {
        $span = $this->tracer->getActiveSpan();
        $startTime = $dispatcher->getParam('_monitoring_start');
        $duration = microtime(true) - $startTime;

        // Record response metrics
        $response = $dispatcher->getReturnedValue();
        $statusCode = $response->getStatusCode();

        $span->setTag('http.status_code', $statusCode);
        $span->setTag('http.duration', $duration);

        // Record latency histogram
        $latencyHistogram = $this->metrics->getOrRegisterHistogram(
            'crm',
            'http_request_duration_seconds',
            'HTTP request latency',
            ['method', 'endpoint', 'status']
        );

        $latencyHistogram->observe($duration, [
            $_SERVER['REQUEST_METHOD'],
            $dispatcher->getControllerName() . '/' . $dispatcher->getActionName(),
            $statusCode
        ]);

        // Track errors
        if ($statusCode >= 400) {
            $errorCounter = $this->metrics->getOrRegisterCounter(
                'crm',
                'http_errors_total',
                'Total HTTP errors',
                ['method', 'endpoint', 'status']
            );

            $errorCounter->inc([
                $_SERVER['REQUEST_METHOD'],
                $dispatcher->getControllerName() . '/' . $dispatcher->getActionName(),
                $statusCode
            ]);

            // Send to Sentry if 5xx error
            if ($statusCode >= 500) {
                \Sentry\captureMessage(
                    "HTTP {$statusCode} Error",
                    \Sentry\Severity::error()
                );
            }
        }

        $span->finish();
    }
}

Infrastructure Monitoring

CloudWatch Metrics

# scripts/custom_metrics.py
import boto3
import psutil
import time
from datetime import datetime

cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')

def send_custom_metrics():
    """Send custom metrics to CloudWatch"""

    # System metrics
    cpu_percent = psutil.cpu_percent(interval=1)
    memory = psutil.virtual_memory()
    disk = psutil.disk_usage('/')

    # Network metrics
    net_io = psutil.net_io_counters()

    # Application metrics
    active_connections = get_active_connections()
    queue_size = get_queue_size()
    cache_hit_rate = get_cache_hit_rate()

    metrics = [
        {
            'MetricName': 'CPUUtilization',
            'Value': cpu_percent,
            'Unit': 'Percent',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'MemoryUtilization',
            'Value': memory.percent,
            'Unit': 'Percent',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'DiskUtilization',
            'Value': disk.percent,
            'Unit': 'Percent',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'NetworkBytesIn',
            'Value': net_io.bytes_recv,
            'Unit': 'Bytes',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'NetworkBytesOut',
            'Value': net_io.bytes_sent,
            'Unit': 'Bytes',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'ActiveConnections',
            'Value': active_connections,
            'Unit': 'Count',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'QueueSize',
            'Value': queue_size,
            'Unit': 'Count',
            'Timestamp': datetime.utcnow()
        },
        {
            'MetricName': 'CacheHitRate',
            'Value': cache_hit_rate,
            'Unit': 'Percent',
            'Timestamp': datetime.utcnow()
        }
    ]

    # Send metrics in batches
    for i in range(0, len(metrics), 20):
        batch = metrics[i:i+20]
        cloudwatch.put_metric_data(
            Namespace='CRM/Application',
            MetricData=batch
        )

    print(f"Sent {len(metrics)} metrics to CloudWatch")

def get_active_connections():
    """Get number of active database connections"""
    # Implementation depends on your database
    return 42

def get_queue_size():
    """Get message queue size"""
    # Implementation depends on your queue system
    return 156

def get_cache_hit_rate():
    """Calculate cache hit rate"""
    # Implementation depends on your cache system
    return 92.5

if __name__ == "__main__":
    while True:
        send_custom_metrics()
        time.sleep(60)  # Send metrics every minute
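
Alarms can be attached to the same CRM/Application namespace so threshold breaches on these custom metrics page the on-call engineer. A sketch using boto3's put_metric_alarm; the SNS topic ARN and the threshold are placeholders:

# scripts/create_alarms.py (illustrative sketch; topic ARN and threshold are placeholders)
import boto3

cloudwatch = boto3.client('cloudwatch', region_name='us-east-1')

def create_queue_size_alarm(sns_topic_arn):
    """Alarm when the queue stays backed up for 3 consecutive minutes."""
    cloudwatch.put_metric_alarm(
        AlarmName='crm-queue-size-high',
        Namespace='CRM/Application',
        MetricName='QueueSize',
        Statistic='Average',
        Period=60,                     # matches the 60-second publish interval above
        EvaluationPeriods=3,
        Threshold=1000,
        ComparisonOperator='GreaterThanThreshold',
        TreatMissingData='breaching',  # missing data likely means the publisher died
        AlarmActions=[sns_topic_arn],
    )

if __name__ == "__main__":
    # Hypothetical SNS topic that forwards to PagerDuty/Slack
    create_queue_size_alarm('arn:aws:sns:us-east-1:123456789012:crm-alerts')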

Server Monitoring

# ansible/monitoring.yml
---
- name: Configure monitoring agents
  hosts: all
  become: yes

  tasks:
    - name: Install Node Exporter
      get_url:
        url: https://github.com/prometheus/node_exporter/releases/download/v1.5.0/node_exporter-1.5.0.linux-amd64.tar.gz
        dest: /tmp/node_exporter.tar.gz

    - name: Extract Node Exporter
      unarchive:
        src: /tmp/node_exporter.tar.gz
        dest: /usr/local/bin
        remote_src: yes
        extra_opts: ["--strip-components=1"]  # place the binary directly in /usr/local/bin

    - name: Create Node Exporter service
      copy:
        content: |
          [Unit]
          Description=Node Exporter
          After=network.target

          [Service]
          Type=simple
          ExecStart=/usr/local/bin/node_exporter
          Restart=always

          [Install]
          WantedBy=multi-user.target
        dest: /etc/systemd/system/node_exporter.service

    - name: Start Node Exporter
      systemd:
        name: node_exporter
        state: started
        enabled: yes

    - name: Install Datadog Agent
      shell: |
        DD_API_KEY="{{ datadog_api_key }}" \
        DD_SITE="datadoghq.com" \
        bash -c "$(curl -L https://s3.amazonaws.com/dd-agent/scripts/install_script.sh)"

    - name: Configure Datadog
      template:
        src: datadog.yaml.j2
        dest: /etc/datadog-agent/datadog.yaml
      notify: restart datadog-agent

Database Monitoring

MySQL Monitoring Queries

-- Performance Schema Queries
-- Top slow queries
SELECT 
    digest_text,
    count_star,
    avg_timer_wait/1000000000 AS avg_time_ms,
    sum_timer_wait/1000000000 AS total_time_ms
FROM performance_schema.events_statements_summary_by_digest
ORDER BY sum_timer_wait DESC
LIMIT 10;

-- Connection statistics
SELECT 
    variable_value AS current_connections
FROM performance_schema.global_status
WHERE variable_name = 'Threads_connected';

-- Table statistics
SELECT 
    table_schema,
    table_name,
    rows_fetched,
    rows_inserted,
    rows_updated,
    rows_deleted
FROM sys.schema_table_statistics
WHERE table_schema = 'crm'
ORDER BY rows_fetched DESC;

-- Lock waits
SELECT 
    waiting_trx_id,
    waiting_pid,
    waiting_query,
    blocking_trx_id,
    blocking_pid,
    blocking_query
FROM sys.innodb_lock_waits;

-- Buffer pool statistics
SELECT 
    page_type,
    pool_id,
    COUNT(*) AS page_count
FROM information_schema.innodb_buffer_page
GROUP BY page_type, pool_id
ORDER BY page_count DESC;

Database Monitoring Script

#!/bin/bash
# scripts/monitor_mysql.sh

MYSQL_HOST="localhost"
MYSQL_USER="monitoring"
MYSQL_PASS="$MYSQL_MONITORING_PASSWORD"

# Function to execute query and send to monitoring
function monitor_metric() {
    local metric_name=$1
    local query=$2
    local value=$(mysql -h $MYSQL_HOST -u $MYSQL_USER -p$MYSQL_PASS -sN -e "$query")

    # Send to Datadog
    echo "mysql.$metric_name:$value|g" | nc -u -w0 127.0.0.1 8125
}

# Monitor connections
monitor_metric "connections.current" \
    "SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Threads_connected'"

# Monitor slow queries
monitor_metric "slow_queries.count" \
    "SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Slow_queries'"

# Monitor query cache hit rate
monitor_metric "query_cache.hit_rate" \
    "SELECT (Qcache_hits / (Qcache_hits + Qcache_inserts + Qcache_not_cached)) * 100 
     FROM (SELECT 
        (SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Qcache_hits') as Qcache_hits,
        (SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Qcache_inserts') as Qcache_inserts,
        (SELECT variable_value FROM performance_schema.global_status WHERE variable_name = 'Qcache_not_cached') as Qcache_not_cached
     ) as cache_stats"

# Monitor replication lag (only meaningful on replicas)
replication_lag=$(mysql -h $MYSQL_HOST -u $MYSQL_USER -p$MYSQL_PASS -sN -e "SHOW SLAVE STATUS\G" | awk '/Seconds_Behind_Master:/ {print $2}')
if [ -n "$replication_lag" ]; then
    echo "mysql.replication.lag_seconds:$replication_lag|g" | nc -u -w0 127.0.0.1 8125
fi

# Monitor table sizes
mysql -h $MYSQL_HOST -u $MYSQL_USER -p$MYSQL_PASS -sN -e "
    SELECT 
        CONCAT('table.size.', table_name, ':', ROUND(data_length / 1024 / 1024, 2))
    FROM information_schema.tables
    WHERE table_schema = 'crm'
" | while read metric; do
    echo "$metric|g" | nc -u -w0 127.0.0.1 8125
done

Log Management

ELK Stack Configuration

# elasticsearch/elasticsearch.yml
cluster.name: crm-logs
node.name: es-node-1
network.host: 0.0.0.0
discovery.seed_hosts: ["es-node-1", "es-node-2", "es-node-3"]
cluster.initial_master_nodes: ["es-node-1"]

# Index lifecycle management
xpack.ilm.enabled: true

# Security
xpack.security.enabled: true
xpack.security.transport.ssl.enabled: true

# logstash/pipeline/crm.conf
input {
  beats {
    port => 5044
  }

  tcp {
    port => 5000
    codec => json
  }
}

filter {
  # Parse application logs
  if [type] == "application" {
    grok {
      match => {
        "message" => "%{TIMESTAMP_ISO8601:timestamp} %{LOGLEVEL:level} %{GREEDYDATA:message}"
      }
    }

    date {
      match => ["timestamp", "ISO8601"]
    }
  }

  # Parse access logs
  if [type] == "access" {
    grok {
      match => {
        "message" => '%{IPORHOST:client_ip} - - \[%{HTTPDATE:timestamp}\] "%{WORD:method} %{URIPATHPARAM:request} HTTP/%{NUMBER:http_version}" %{NUMBER:status_code} %{NUMBER:bytes} "%{URI:referrer}" "%{GREEDYDATA:user_agent}"'
      }
    }

    geoip {
      source => "client_ip"
    }

    useragent {
      source => "user_agent"
    }
  }

  # Add metadata
  mutate {
    add_field => {
      "environment" => "${ENVIRONMENT:production}"
      "application" => "crm"
    }
  }

  # Remove sensitive data
  mutate {
    remove_field => ["password", "token", "secret", "credit_card"]
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "crm-%{type}-%{+YYYY.MM.dd}"
    template_name => "crm"
    template => "/etc/logstash/templates/crm.json"
  }

  # Send critical errors to Slack
  if [level] == "ERROR" or [level] == "CRITICAL" {
    http {
      url => "${SLACK_WEBHOOK_URL}"
      http_method => "post"
      format => "json"
      mapping => {
        "text" => "🚨 Error in %{application}: %{message}"
        "channel" => "#alerts"
      }
    }
  }
}
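
Services that cannot run a Beats agent can write structured events straight to the tcp/json input on port 5000 defined above. A minimal sketch, assuming a hypothetical logstash.internal hostname and reusing the log format the application filter expects:

# scripts/ship_log.py (sketch; host name is an assumption, port matches the tcp input in crm.conf)
import json
import socket
from datetime import datetime, timezone

LOGSTASH_HOST = "logstash.internal"  # hypothetical internal hostname
LOGSTASH_PORT = 5000

def ship_log(level, message, **fields):
    """Send one JSON event per connection to the Logstash tcp input."""
    timestamp = datetime.now(timezone.utc).isoformat()
    event = {
        "type": "application",
        # Matches the grok pattern in the 'application' filter above
        "message": f"{timestamp} {level} {message}",
        **fields,
    }
    with socket.create_connection((LOGSTASH_HOST, LOGSTASH_PORT), timeout=5) as sock:
        sock.sendall((json.dumps(event) + "\n").encode("utf-8"))

if __name__ == "__main__":
    ship_log("ERROR", "Payment webhook failed", order_id=4821)  # illustrative fields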

Application Logging

<?php
// app/config/logging.php

use Monolog\Logger;
use Monolog\Handler\StreamHandler;
use Monolog\Handler\ElasticsearchHandler;
use Monolog\Formatter\JsonFormatter;
use Elasticsearch\ClientBuilder;

return [
    'default' => 'stack',

    'channels' => [
        'stack' => [
            'driver' => 'stack',
            'channels' => ['daily', 'elasticsearch', 'sentry'],
        ],

        'daily' => [
            'driver' => 'daily',
            'path' => storage_path('logs/crm.log'),
            'level' => 'debug',
            'days' => 14,
            'formatter' => JsonFormatter::class,
        ],

        'elasticsearch' => [
            'driver' => 'custom',
            'via' => function () {
                $client = ClientBuilder::create()
                    ->setHosts([env('ELASTICSEARCH_HOST')])
                    ->build();

                $handler = new ElasticsearchHandler($client, [
                    'index' => 'crm-logs',
                    'type' => '_doc',
                ]);

                // ElasticsearchHandler applies its own ElasticsearchFormatter;
                // a plain JsonFormatter is rejected by this handler.

                return new Logger('elasticsearch', [$handler]);
            },
        ],

        'sentry' => [
            'driver' => 'sentry',
            'level' => 'error',
            'bubble' => true,
        ],

        'performance' => [
            'driver' => 'custom',
            'via' => function () {
                $handler = new StreamHandler(
                    storage_path('logs/performance.log'),
                    Logger::INFO
                );

                $handler->setFormatter(new JsonFormatter());

                return new Logger('performance', [$handler]);
            },
        ],
    ],
];

Alerting Strategy

Alert Configuration

# alerts/rules.yml
groups:
  - name: application
    interval: 30s
    rules:
      - alert: HighErrorRate
        expr: rate(http_errors_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value }} errors per second"

      - alert: HighLatency
        expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 1
        for: 5m
        labels:
          severity: warning
          team: backend
        annotations:
          summary: "High latency detected"
          description: "95th percentile latency is {{ $value }} seconds"

      - alert: LowCacheHitRate
        expr: cache_hit_rate < 80
        for: 10m
        labels:
          severity: info
          team: backend
        annotations:
          summary: "Low cache hit rate"
          description: "Cache hit rate is {{ $value }}%"

  - name: infrastructure
    interval: 60s
    rules:
      - alert: HighCPUUsage
        expr: cpu_usage_percent > 80
        for: 5m
        labels:
          severity: warning
          team: devops
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      - alert: LowDiskSpace
        expr: disk_free_percent < 20
        for: 5m
        labels:
          severity: critical
          team: devops
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Only {{ $value }}% disk space remaining"

      - alert: DatabaseConnectionPoolExhausted
        expr: database_connections_active / database_connections_max > 0.9
        for: 5m
        labels:
          severity: critical
          team: database
        annotations:
          summary: "Database connection pool near exhaustion"
          description: "{{ $value }}% of connections in use"
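
Rule expressions can be sanity-checked against the Prometheus query API before a rules change ships, which catches typos and empty result sets early. A hedged sketch; the Prometheus URL is an assumption:

# scripts/check_alert_expr.py (sketch; PROMETHEUS_URL is an assumed internal endpoint)
import requests

PROMETHEUS_URL = "http://prometheus.internal:9090"

def evaluate(expr):
    """Run an instant query and return the resulting samples."""
    resp = requests.get(
        f"{PROMETHEUS_URL}/api/v1/query",
        params={"query": expr},
        timeout=10,
    )
    resp.raise_for_status()
    return resp.json()["data"]["result"]

if __name__ == "__main__":
    # Same expression as the HighErrorRate rule above
    for sample in evaluate("rate(http_errors_total[5m]) > 0.05"):
        print(sample["metric"], sample["value"])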

PagerDuty Integration

# scripts/pagerduty_integration.py
import requests
import json
from datetime import datetime

class PagerDutyAlert:
    def __init__(self, api_key, service_id):
        self.api_key = api_key
        self.service_id = service_id
        self.base_url = "https://api.pagerduty.com"

    def trigger_incident(self, summary, severity="error", details=None):
        """Trigger a PagerDuty incident"""

        payload = {
            "incident": {
                "type": "incident",
                "title": summary,
                "service": {
                    "id": self.service_id,
                    "type": "service_reference"
                },
                "urgency": self._get_urgency(severity),
                "body": {
                    "type": "incident_body",
                    "details": details or summary
                }
            }
        }

        headers = {
            "Authorization": f"Token token={self.api_key}",
            "Content-Type": "application/json",
            "From": "monitoring@ourcrm.com"
        }

        response = requests.post(
            f"{self.base_url}/incidents",
            headers=headers,
            json=payload
        )

        if response.status_code == 201:
            incident = response.json()["incident"]
            return incident["id"]
        else:
            raise Exception(f"Failed to create incident: {response.text}")

    def _get_urgency(self, severity):
        """Map severity to PagerDuty urgency"""
        mapping = {
            "critical": "high",
            "error": "high",
            "warning": "low",
            "info": "low"
        }
        return mapping.get(severity, "low")
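
For example, the helper above can be called from a cron job or the custom metrics script; the credentials come from the environment and the threshold below is illustrative:

# Example usage appended to scripts/pagerduty_integration.py (sketch)
import os

pd = PagerDutyAlert(
    api_key=os.environ["PAGERDUTY_API_KEY"],
    service_id=os.environ["PAGERDUTY_SERVICE_ID"],
)

queue_size = 1540  # would normally come from the QueueSize metric above
if queue_size > 1000:
    incident_id = pd.trigger_incident(
        summary=f"Queue backlog at {queue_size} messages",
        severity="critical",
        details="Queue consumer appears stalled; see the QueueSize metric in CloudWatch.",
    )
    print(f"Opened PagerDuty incident {incident_id}")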

Dashboard Creation

Grafana Dashboard

{
  "dashboard": {
    "title": "CRM Application Dashboard",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{method}} {{endpoint}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_errors_total[5m])",
            "legendFormat": "{{status}}"
          }
        ]
      },
      {
        "title": "Response Time (p95)",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, http_request_duration_seconds)",
            "legendFormat": "95th percentile"
          }
        ]
      },
      {
        "title": "Active Users",
        "type": "stat",
        "targets": [
          {
            "expr": "active_users_total"
          }
        ]
      },
      {
        "title": "Database Connections",
        "type": "gauge",
        "targets": [
          {
            "expr": "mysql_connections_active / mysql_connections_max * 100"
          }
        ]
      },
      {
        "title": "Cache Hit Rate",
        "type": "gauge",
        "targets": [
          {
            "expr": "redis_hits / (redis_hits + redis_misses) * 100"
          }
        ]
      }
    ]
  }
}
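
Dashboards like the one above are kept as JSON in the repo and pushed through Grafana's dashboard API rather than edited in the UI. A minimal sketch, assuming an internal Grafana URL, an API token in the environment, and a hypothetical file path:

# scripts/push_dashboard.py (sketch; URL, token variable, and path are assumptions)
import json
import os
import requests

GRAFANA_URL = "http://grafana.internal:3000"

def push_dashboard(path):
    """Create or update a dashboard from a JSON file in the repo."""
    with open(path) as f:
        dashboard = json.load(f)["dashboard"]

    resp = requests.post(
        f"{GRAFANA_URL}/api/dashboards/db",
        headers={"Authorization": f"Bearer {os.environ['GRAFANA_API_TOKEN']}"},
        json={"dashboard": dashboard, "overwrite": True},
        timeout=10,
    )
    resp.raise_for_status()
    print(resp.json()["url"])

if __name__ == "__main__":
    push_dashboard("dashboards/crm-application.json")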

Health Checks

Application Health Endpoint

<?php
// app/controllers/HealthController.php

class HealthController extends Controller
{
    public function check()
    {
        $health = [
            'status' => 'healthy',
            'timestamp' => time(),
            'version' => getenv('APP_VERSION'),
            'checks' => []
        ];

        // Database check
        try {
            $db = $this->db;
            $db->query("SELECT 1");
            $health['checks']['database'] = [
                'status' => 'healthy',
                'response_time' => $this->measureTime(function() use ($db) {
                    $db->query("SELECT 1");
                })
            ];
        } catch (\Exception $e) {
            $health['status'] = 'unhealthy';
            $health['checks']['database'] = [
                'status' => 'unhealthy',
                'error' => $e->getMessage()
            ];
        }

        // Redis check
        try {
            $redis = $this->redis;
            $redis->ping();
            $health['checks']['redis'] = [
                'status' => 'healthy',
                'response_time' => $this->measureTime(function() use ($redis) {
                    $redis->ping();
                })
            ];
        } catch (\Exception $e) {
            $health['status'] = 'degraded';
            $health['checks']['redis'] = [
                'status' => 'unhealthy',
                'error' => $e->getMessage()
            ];
        }

        // Queue check
        try {
            $queue = $this->queue;
            $queueSize = $queue->size();
            $health['checks']['queue'] = [
                'status' => $queueSize < 1000 ? 'healthy' : 'degraded',
                'size' => $queueSize
            ];
        } catch (\Exception $e) {
            $health['checks']['queue'] = [
                'status' => 'unhealthy',
                'error' => $e->getMessage()
            ];
        }

        // Disk space check
        $diskFree = disk_free_space('/');
        $diskTotal = disk_total_space('/');
        $diskPercent = ($diskFree / $diskTotal) * 100;

        $health['checks']['disk'] = [
            'status' => $diskPercent > 20 ? 'healthy' : 'degraded',
            'free_percent' => round($diskPercent, 2)
        ];

        // Memory check
        $memoryUsage = memory_get_usage(true);
        $memoryLimit = ini_get('memory_limit');

        $health['checks']['memory'] = [
            'status' => 'healthy',
            'usage_mb' => round($memoryUsage / 1024 / 1024, 2),
            'limit' => $memoryLimit
        ];

        // Set appropriate HTTP status code
        $statusCode = 200;
        if ($health['status'] === 'degraded') {
            $statusCode = 200;  // Still operational
        } elseif ($health['status'] === 'unhealthy') {
            $statusCode = 503;  // Service unavailable
        }

        return $this->response
            ->setStatusCode($statusCode)
            ->setJsonContent($health);
    }

    private function measureTime(callable $operation)
    {
        $start = microtime(true);
        $operation();
        return round((microtime(true) - $start) * 1000, 2);  // in ms
    }
}
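
Uptime monitors and load-balancer probes consume this endpoint and only need the status code; richer callers can inspect the per-check details. A hedged poller example; the /health URL is an assumption:

# scripts/health_probe.py (sketch; the URL and /health path are assumptions)
import requests

HEALTH_URL = "https://app.ourcrm.com/health"

def probe():
    """Return (ok, payload) for one health-check round trip."""
    try:
        resp = requests.get(HEALTH_URL, timeout=5)
        payload = resp.json()
    except requests.RequestException as exc:
        return False, {"error": str(exc)}
    # 200 covers both 'healthy' and 'degraded'; 503 means 'unhealthy'
    return resp.status_code == 200, payload

if __name__ == "__main__":
    ok, payload = probe()
    print(f"ok={ok} status={payload.get('status', 'unknown')}")
    for name, check in payload.get("checks", {}).items():
        print(f"  {name}: {check.get('status')}")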

Last Updated: January 2025
Version: 1.0.0