Skip to content

Rollback Procedures

Overview

This document provides comprehensive rollback procedures for various failure scenarios in our SaaS CRM application, ensuring minimal downtime and data integrity.

Rollback Strategy

Rollback Principles

  1. Fast Recovery: Prioritize service restoration over root cause analysis
  2. Data Integrity: Ensure no data loss during rollback
  3. Communication: Keep stakeholders informed throughout the process
  4. Documentation: Record all actions taken during rollback
  5. Post-Mortem: Analyze after service restoration

Rollback Decision Matrix

| Scenario | Severity | Rollback Time | Decision |
| --- | --- | --- | --- |
| Critical bug in production | P0 | < 15 min | Immediate rollback |
| Performance degradation >50% | P1 | < 30 min | Rollback after quick fix attempt |
| Feature bug affecting <10% users | P2 | < 2 hours | Hotfix or feature flag |
| Minor UI issues | P3 | Next release | Forward fix |
| Data corruption detected | P0 | Immediate | Stop writes, rollback, restore |

Application Rollback

Code Deployment Rollback

Automated Rollback Script

#!/bin/bash
# scripts/rollback.sh
#
# Usage: rollback.sh [environment] [rollback_version] [dry_run]
#   environment       defaults to "production"
#   rollback_version  defaults to "previous"
#   dry_run           defaults to "false"

# Abort on the first command that fails.
set -o errexit

# Positional arguments, each with a fallback value.
ENVIRONMENT="${1:-production}"
ROLLBACK_VERSION="${2:-previous}"
DRY_RUN="${3:-false}"

# Terminal colour sequences. They are stored as literal backslash text and
# only become escape characters when printed through `echo -e`.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

START_BANNER="${YELLOW}Starting rollback process for ${ENVIRONMENT}${NC}"
echo -e "$START_BANNER"

# Record a progress message in the rollback log and echo it to the console.
#
# Globals:   GREEN, NC (read) - colour sequences for the console line
#            ROLLBACK_LOG_FILE (read, optional) - log path, defaults to
#            /var/log/rollback.log
# Arguments: $1 - message text, written verbatim
# Outputs:   "✓ <message>" on stdout; a timestamped line appended to the log
# Returns:   0, also when the log file cannot be written
log_action() {
    local message=$1
    local log_file=${ROLLBACK_LOG_FILE:-/var/log/rollback.log}

    # Logging is best-effort: with `set -e` active in the caller, a failed
    # append would otherwise stop the rollback half-way through.
    if ! printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >> "$log_file"; then
        printf 'warning: could not write to %s\n' "$log_file" >&2
    fi

    # Only the colour codes go through %b; the message uses %s so that any
    # backslashes in it are printed as-is instead of being interpreted.
    printf '%b✓%b %s\n' "${GREEN:-}" "${NC:-}" "$message"
}

# Report a fatal error, record it in the rollback log and stop the script.
#
# Globals:   RED, NC (read) - colour sequences for the console line
#            ROLLBACK_LOG_FILE (read, optional) - log path, defaults to
#            /var/log/rollback.log
# Arguments: $1 - error description, written verbatim
# Outputs:   "✗ Error: <message>" on stderr; a timestamped line in the log
# Returns:   never returns; exits the shell with status 1
handle_error() {
    local message=$1
    local log_file=${ROLLBACK_LOG_FILE:-/var/log/rollback.log}

    # %s keeps backslashes in the message literal; diagnostics go to stderr.
    printf '%b✗ Error: %s%b\n' "${RED:-}" "$message" "${NC:-}" >&2

    # The log entry is best-effort; the exit below is what must always happen.
    printf '[%s] ERROR: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >> "$log_file" || true

    exit 1
}

# 1. Verify environment
if [[ ! "$ENVIRONMENT" =~ ^(production|staging|development)$ ]]; then
    handle_error "Invalid environment: $ENVIRONMENT"
fi

# 2. Get current version
# The task definition ARN is captured on its own rather than through a
# pipeline, so a failing AWS call is detected here instead of being hidden
# behind the exit status of a downstream filter.
if ! TASK_DEFINITION_ARN=$(aws ecs describe-services \
    --cluster "${ENVIRONMENT}-cluster" \
    --services crm-app \
    --query 'services[0].taskDefinition' \
    --output text); then
    handle_error "Unable to read the current task definition from ${ENVIRONMENT}-cluster"
fi

# Keep only what follows the last ':' of the ARN, i.e. the revision number.
CURRENT_VERSION=${TASK_DEFINITION_ARN##*:}

# Anything that is not a positive integer (empty output, "None", ...) would
# silently turn into a bogus revision in the arithmetic below.
if [[ ! "$CURRENT_VERSION" =~ ^[1-9][0-9]*$ ]]; then
    handle_error "Unexpected current version: '$CURRENT_VERSION'"
fi

log_action "Current version: $CURRENT_VERSION"

# 3. Determine rollback version
if [ "$ROLLBACK_VERSION" == "previous" ]; then
    if (( CURRENT_VERSION <= 1 )); then
        handle_error "No revision exists before version $CURRENT_VERSION"
    fi
    # NOTE(review): assumes revision N-1 is still registered and is the
    # release that should be restored - confirm before relying on "previous".
    ROLLBACK_VERSION=$((CURRENT_VERSION - 1))
elif [[ ! "$ROLLBACK_VERSION" =~ ^[1-9][0-9]*$ ]]; then
    handle_error "Invalid rollback version: $ROLLBACK_VERSION"
fi

log_action "Rolling back to version: $ROLLBACK_VERSION"

# 4. Create backup of current state
if [ "$DRY_RUN" == "false" ]; then
    # Stop before anything is modified if a setting used by steps 4-5 is
    # missing; an empty value would otherwise shift the command arguments.
    for required_var in DB_HOST DB_USER DB_PASS DB_NAME CF_DISTRIBUTION_ID; do
        if [ -z "${!required_var:-}" ]; then
            handle_error "Required variable $required_var is not set"
        fi
    done

    log_action "Creating backup of current state..."

    # One timestamp for both artifacts so they can be paired up afterwards.
    BACKUP_STAMP=$(date +%Y%m%d_%H%M%S)
    DB_BACKUP_FILE="/backups/rollback_${CURRENT_VERSION}_${BACKUP_STAMP}.sql"
    APP_BACKUP_FILE="/backups/app_${CURRENT_VERSION}_${BACKUP_STAMP}.tar.gz"

    # Backup database
    # NOTE(review): the password is passed on the command line, where other
    # local users may see it in the process list; consider a client option
    # file readable only by the deploy user.
    if ! mysqldump -h "$DB_HOST" -u "$DB_USER" -p"$DB_PASS" "$DB_NAME" > "$DB_BACKUP_FILE"; then
        # Do not leave a truncated dump that could be mistaken for a backup.
        rm -f -- "$DB_BACKUP_FILE"
        handle_error "Database backup failed"
    fi

    # Backup application files
    if ! tar -czf "$APP_BACKUP_FILE" /var/www/html; then
        rm -f -- "$APP_BACKUP_FILE"
        handle_error "Application file backup failed"
    fi

    log_action "Backup completed"
fi

# 5. Enable maintenance mode
if [ "$DRY_RUN" == "false" ]; then
    log_action "Enabling maintenance mode..."
    # maintenance.html is resolved relative to the current working directory.
    aws s3 cp maintenance.html s3://crm-static/maintenance.html
    aws cloudfront create-invalidation \
        --distribution-id "$CF_DISTRIBUTION_ID" \
        --paths "/*"
fi

# 6. Perform rollback
if [ "$DRY_RUN" == "false" ]; then
    log_action "Starting rollback..."

    # Update ECS service
    # Failures are routed through handle_error so they reach the rollback log
    # instead of ending the script silently via `set -e`.
    if ! aws ecs update-service \
        --cluster "${ENVIRONMENT}-cluster" \
        --service crm-app \
        --task-definition "crm-app:${ROLLBACK_VERSION}" \
        --force-new-deployment; then
        handle_error "ECS service update to crm-app:${ROLLBACK_VERSION} failed"
    fi

    # Wait for rollback to complete
    log_action "Waiting for services to stabilize..."
    if ! aws ecs wait services-stable \
        --cluster "${ENVIRONMENT}-cluster" \
        --services crm-app; then
        handle_error "ECS service did not stabilize after the rollback"
    fi

    log_action "Rollback deployment completed"
else
    echo -e "${YELLOW}DRY RUN: Would rollback to version $ROLLBACK_VERSION${NC}"
fi

# 7. Verify health
# This runs in dry-run mode too, where it checks the deployment as it is.
log_action "Verifying application health..."

# `|| true`: when curl cannot connect it exits non-zero, which under `set -e`
# would end the script before the status is reported. curl still prints 000
# as the status code in that case, so the comparison below handles it.
# --max-time keeps an unresponsive endpoint from hanging the rollback.
HEALTH_CHECK=$(curl -s -o /dev/null --max-time 10 -w "%{http_code}" "https://${APP_URL}/health") || true

if [ "$HEALTH_CHECK" != "200" ]; then
    # handle_error exits here, so the maintenance page is left in place.
    handle_error "Health check failed with status: $HEALTH_CHECK"
fi

# 8. Disable maintenance mode
if [ "$DRY_RUN" == "false" ]; then
    log_action "Disabling maintenance mode..."
    aws s3 rm s3://crm-static/maintenance.html
    aws cloudfront create-invalidation \
        --distribution-id "$CF_DISTRIBUTION_ID" \
        --paths "/*"
fi

# 9. Send notifications
# Notifications are best-effort: by this point the rollback itself has
# succeeded, so a Slack or SES outage must not make the script exit non-zero.
if [ "$DRY_RUN" == "false" ]; then
    log_action "Sending notifications..."

    # Slack notification
    curl -X POST "$SLACK_WEBHOOK_URL" \
        -H 'Content-Type: application/json' \
        -d "{
            \"text\": \"🔄 Rollback completed\",
            \"blocks\": [{
                \"type\": \"section\",
                \"text\": {
                    \"type\": \"mrkdwn\",
                    \"text\": \"*Environment:* $ENVIRONMENT\n*From Version:* $CURRENT_VERSION\n*To Version:* $ROLLBACK_VERSION\n*Status:* ✅ Success\"
                }
            }]
        }" || echo -e "${YELLOW}Warning: Slack notification could not be sent${NC}" >&2

    # Email notification
    aws ses send-email \
        --from "alerts@ourcrm.com" \
        --to "team@ourcrm.com" \
        --subject "Rollback Completed - $ENVIRONMENT" \
        --text "Rollback from version $CURRENT_VERSION to $ROLLBACK_VERSION completed successfully." \
        || echo -e "${YELLOW}Warning: email notification could not be sent${NC}" >&2
fi

# A dry run deploys nothing, so it must not claim that a rollback happened.
if [ "$DRY_RUN" == "false" ]; then
    echo -e "${GREEN}Rollback completed successfully!${NC}"
else
    echo -e "${YELLOW}DRY RUN completed: no deployment changes were made${NC}"
fi

Docker Container Rollback

#!/bin/bash
# scripts/docker_rollback.sh
#
# Points docker-compose.yml at the second-newest local crm-app image and
# restarts the stack. Nothing is stopped or edited until a rollback target
# has been found.

set -euo pipefail

# Get previous image
# `docker images` lists the newest image first, so the second match is the
# one before the current image. Untagged ("<none>") entries are skipped
# because they cannot be referenced as repository:tag. awk is used for the
# filtering so that "no match" yields an empty result, not a failed pipeline.
PREVIOUS_IMAGE=$(docker images --format '{{.Repository}}:{{.Tag}}' \
    | awk '/crm-app/ && !/<none>/ { if (++seen == 2) print }')

# With fewer than two images there is nothing to roll back to. Without this
# guard the script would "roll back" to the current image or write an empty
# image name into the compose file.
if [[ -z "$PREVIOUS_IMAGE" ]]; then
    echo "No previous crm-app image found; nothing to roll back to" >&2
    exit 1
fi

echo "Rolling back to image: $PREVIOUS_IMAGE"

# Update docker-compose.yml with previous image
# Done before the containers are stopped: if the edit fails, the currently
# running stack is left untouched.
sed -i "s|image: crm-app:.*|image: $PREVIOUS_IMAGE|g" docker-compose.yml

# Stop current containers
docker-compose down

# Start with previous image
docker-compose up -d

# Verify
docker-compose ps
docker-compose logs --tail=50

Kubernetes Rollback

#!/bin/bash
# scripts/k8s_rollback.sh
#
# Usage: k8s_rollback.sh [deployment] [namespace]
#   deployment  defaults to "crm-app"
#   namespace   defaults to "production"

# Stop at the first failing kubectl call, so that a failed undo is not
# followed by a status report that looks like a finished rollback.
set -euo pipefail

DEPLOYMENT="${1:-crm-app}"
NAMESPACE="${2:-production}"

# View rollout history
kubectl rollout history "deployment/$DEPLOYMENT" -n "$NAMESPACE"

# Rollback to previous version
kubectl rollout undo "deployment/$DEPLOYMENT" -n "$NAMESPACE"

# Or rollback to specific revision
# kubectl rollout undo "deployment/$DEPLOYMENT" --to-revision=2 -n "$NAMESPACE"

# Check rollout status
kubectl rollout status "deployment/$DEPLOYMENT" -n "$NAMESPACE"

# Verify pods
kubectl get pods -n "$NAMESPACE" -l "app=$DEPLOYMENT"

Database Rollback

Schema Rollback

-- rollback/2025_01_15_rollback.sql
-- Rollback for migration 2025_01_15_add_features.sql
--
-- IMPORTANT: ALTER TABLE and DROP TABLE cause an implicit commit in MySQL and
-- MariaDB. The schema changes below therefore take effect one statement at a
-- time and cannot be undone with ROLLBACK, which is why this script is not
-- wrapped in START TRANSACTION. If a statement fails part-way, the earlier
-- ones stay applied.
--
-- NOTE(review): DROP COLUMN IF EXISTS / DROP INDEX IF EXISTS are MariaDB
-- syntax - confirm that the target server accepts them.

-- Log rollback start
INSERT INTO migration_log (action, migration, started_at)
VALUES ('rollback_start', '2025_01_15_add_features', NOW());

-- Reverse schema changes
ALTER TABLE customers DROP COLUMN IF EXISTS new_feature_flag;
ALTER TABLE customers DROP INDEX IF EXISTS idx_feature_flag;

-- Restore previous column if renamed
-- This statement is unconditional: it fails if temp_status does not exist.
ALTER TABLE customers CHANGE COLUMN temp_status status VARCHAR(50);

-- Drop new tables
DROP TABLE IF EXISTS feature_settings;

-- Restore constraints
ALTER TABLE orders 
ADD CONSTRAINT fk_orders_customer 
FOREIGN KEY (customer_id) REFERENCES customers(id);

-- Verify rollback
SELECT 
    COUNT(*) as table_check
FROM information_schema.columns
WHERE table_schema = 'crm'
    AND table_name = 'customers'
    AND column_name = 'new_feature_flag';

-- Should return 0.
-- If it does not, the DDL above has already been committed: issuing ROLLBACK
-- will not revert it. Stop here and repair the schema manually or restore
-- from a backup.

-- Log rollback completion
INSERT INTO migration_log (action, migration, completed_at)
VALUES ('rollback_complete', '2025_01_15_add_features', NOW());

-- Makes the log row durable when the session runs with autocommit disabled;
-- it has no effect otherwise.
COMMIT;

Data Rollback

-- Backup and restore procedure
--
-- rollback_data_changes(p_backup_table, p_target_table, p_timestamp)
--   Replaces every row of p_target_table with the rows of p_backup_table
--   whose backup_timestamp equals p_timestamp, inside one transaction, and
--   records the restore in rollback_log.
--
--   Any failure rolls the transaction back and is reported as SQLSTATE
--   '45000'. The message is 'No backup data found for specified timestamp'
--   when the backup holds no matching rows, and 'Rollback failed,
--   transaction rolled back' otherwise.
--
--   NOTE: both table names are concatenated into dynamic SQL unquoted. Call
--   this procedure with trusted, known table names only.
--   NOTE(review): INSERT ... SELECT * requires temp_rollback (including its
--   backup_timestamp column) to line up with the target table's columns -
--   confirm against the real schemas.
DELIMITER $$

CREATE PROCEDURE rollback_data_changes(
    IN p_backup_table VARCHAR(255),
    IN p_target_table VARCHAR(255),
    IN p_timestamp DATETIME
)
BEGIN
    -- Message raised by the exit handler. The body overrides it before
    -- signalling a specific condition, so that message reaches the caller
    -- instead of being replaced by the generic one.
    DECLARE v_error_message VARCHAR(128)
        DEFAULT 'Rollback failed, transaction rolled back';

    DECLARE EXIT HANDLER FOR SQLEXCEPTION
    BEGIN
        ROLLBACK;
        -- Temporary tables outlive the transaction; remove the scratch table
        -- so the next call in this session can create it again.
        DROP TEMPORARY TABLE IF EXISTS temp_rollback;
        SIGNAL SQLSTATE '45000' 
        SET MESSAGE_TEXT = v_error_message;
    END;

    -- Clear any scratch table left behind earlier in this session.
    DROP TEMPORARY TABLE IF EXISTS temp_rollback;

    START TRANSACTION;

    -- Create temporary table with backup data
    SET @sql = CONCAT('CREATE TEMPORARY TABLE temp_rollback AS ',
                      'SELECT * FROM ', p_backup_table,
                      ' WHERE backup_timestamp = ?');
    -- EXECUTE ... USING is given a user variable rather than the procedure
    -- parameter, which MySQL does not accept there.
    SET @restore_timestamp = p_timestamp;
    PREPARE stmt FROM @sql;
    EXECUTE stmt USING @restore_timestamp;
    DEALLOCATE PREPARE stmt;

    -- Check if backup data exists
    SELECT COUNT(*) INTO @row_count FROM temp_rollback;

    IF @row_count = 0 THEN
        SET v_error_message = 'No backup data found for specified timestamp';
        SIGNAL SQLSTATE '45000' 
        SET MESSAGE_TEXT = v_error_message;
    END IF;

    -- Clear current data
    SET @sql = CONCAT('DELETE FROM ', p_target_table);
    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    DEALLOCATE PREPARE stmt;

    -- Restore backup data
    SET @sql = CONCAT('INSERT INTO ', p_target_table, 
                      ' SELECT * FROM temp_rollback');
    PREPARE stmt FROM @sql;
    EXECUTE stmt;
    DEALLOCATE PREPARE stmt;

    -- Log the rollback
    INSERT INTO rollback_log (
        table_name,
        rollback_timestamp,
        restored_from,
        row_count
    ) VALUES (
        p_target_table,
        NOW(),
        p_timestamp,
        @row_count
    );

    DROP TEMPORARY TABLE temp_rollback;

    COMMIT;

    SELECT CONCAT('Successfully rolled back ', @row_count, ' rows') AS result;
END$$

DELIMITER ;

Point-in-Time Recovery

#!/bin/bash
# scripts/pitr_recovery.sh
#
# Usage: pitr_recovery.sh ["YYYY-MM-DD HH:MM:SS"]
#   Restores the newest full backup uploaded at or before the recovery time,
#   then replays binary logs up to that time.
# Requires: MYSQL_ROOT_PASS in the environment.

# Stop at the first failure, so binary logs are never replayed on top of a
# full restore that did not complete.
set -euo pipefail

# Configuration
BACKUP_BUCKET="s3://crm-backups"
RECOVERY_TIME="${1:-2025-01-15 14:30:00}"
DB_NAME="crm"

: "${MYSQL_ROOT_PASS:?MYSQL_ROOT_PASS must be set}"

# Private scratch directory instead of fixed names under /tmp; removed on
# every exit path.
WORK_DIR=$(mktemp -d)
trap 'rm -rf -- "$WORK_DIR"' EXIT

echo "Starting point-in-time recovery to $RECOVERY_TIME"

# 1. Find the latest full backup before recovery time
# `aws s3 ls` prints "<date> <time> <size> <key>". Backups whose listed
# timestamp is later than the recovery time are discarded, and the newest of
# the remaining ones is used.
# NOTE(review): this compares the S3 timestamp as displayed by the CLI with
# RECOVERY_TIME as plain text - confirm both use the same time zone and that
# the upload time is close enough to the time the dump was taken.
FULL_BACKUP=$(aws s3 ls "$BACKUP_BUCKET/full/" --recursive \
    | awk -v cutoff="$RECOVERY_TIME" \
        '$4 ~ /\.sql\.gz/ && ($1 " " $2) <= cutoff { print $1, $2, $4 }' \
    | LC_ALL=C sort \
    | tail -n 1 \
    | awk '{ print $3 }')

if [[ -z "$FULL_BACKUP" ]]; then
    echo "No full backup found at or before $RECOVERY_TIME" >&2
    exit 1
fi

echo "Using full backup: $FULL_BACKUP"

# 2. Download and restore full backup
aws s3 cp "$BACKUP_BUCKET/$FULL_BACKUP" "$WORK_DIR/full_backup.sql.gz"
gunzip "$WORK_DIR/full_backup.sql.gz"

mysql -h localhost -u root -p"$MYSQL_ROOT_PASS" "$DB_NAME" < "$WORK_DIR/full_backup.sql"

# 3. Apply binary logs up to recovery time
# The binary log index file also matches "mysql-bin" but is not a log, so it
# is excluded.
BINLOG_KEYS=$(aws s3 ls "$BACKUP_BUCKET/binlogs/" --recursive \
    | awk '$4 ~ /mysql-bin/ && $4 !~ /\.index$/ { print $4 }')

BINLOG_FILES=()
while IFS= read -r binlog; do
    [[ -n "$binlog" ]] || continue
    echo "Downloading binary log: $binlog"
    aws s3 cp "$BACKUP_BUCKET/$binlog" "$WORK_DIR/" < /dev/null
    BINLOG_FILES+=("$WORK_DIR/$(basename -- "$binlog")")
done <<< "$BINLOG_KEYS"

if (( ${#BINLOG_FILES[@]} > 0 )); then
    # All logs are replayed by one mysqlbinlog process over one connection;
    # feeding each file to its own mysql session can break statements that
    # depend on session state carried over from an earlier log.
    # NOTE(review): no start position is given, so events recorded before the
    # full backup was taken are replayed as well - confirm whether the backup
    # records a binary log position that should be passed here.
    echo "Applying ${#BINLOG_FILES[@]} binary log(s) up to $RECOVERY_TIME"
    mysqlbinlog \
        --stop-datetime="$RECOVERY_TIME" \
        "${BINLOG_FILES[@]}" \
        | mysql -h localhost -u root -p"$MYSQL_ROOT_PASS" "$DB_NAME"
else
    echo "No binary logs found; restored the full backup only" >&2
fi

echo "Point-in-time recovery completed"

Feature Flag Rollback

Runtime Feature Disable

// utils/featureFlags.js
/**
 * Feature flags stored in the Redis hash `feature_flags`, one JSON document
 * per flag, cached in memory and refreshed by polling.
 *
 * The client passed in must expose promise-returning `hgetall`, `hset` and
 * `lpush` methods.
 */
class FeatureFlags {
  /**
   * @param {object} redisClient - Redis client used for all reads and writes
   */
  constructor(redisClient) {
    this.redis = redisClient
    this.flags = new Map()
    this.pollInterval = 10000 // 10 seconds
    this.pollTimer = null
  }

  /**
   * Loads the flags once and then keeps refreshing them every
   * `pollInterval` milliseconds. Calling it again does not add a second timer.
   */
  async initialize() {
    await this.loadFlags()
    if (this.pollTimer === null) {
      this.pollTimer = setInterval(() => this.loadFlags(), this.pollInterval)
    }
  }

  /** Stops the periodic refresh started by initialize(). */
  stop() {
    if (this.pollTimer !== null) {
      clearInterval(this.pollTimer)
      this.pollTimer = null
    }
  }

  /**
   * Refreshes the in-memory cache from Redis. Errors are logged and never
   * thrown; when Redis cannot be read the previous cache is kept as-is.
   */
  async loadFlags() {
    try {
      // A missing hash may come back as null rather than an empty object.
      const stored = (await this.redis.hgetall('feature_flags')) || {}

      // Drop flags that no longer exist in Redis, so deleting a flag there
      // actually switches it off here.
      for (const key of Array.from(this.flags.keys())) {
        if (!Object.prototype.hasOwnProperty.call(stored, key)) {
          this.flags.delete(key)
        }
      }

      for (const [key, value] of Object.entries(stored)) {
        try {
          this.flags.set(key, JSON.parse(value))
        } catch (parseError) {
          // One malformed entry must not block the remaining flags; the
          // previously cached value for this key, if any, stays in place.
          console.error(`Ignoring malformed feature flag ${key}:`, parseError)
        }
      }

      console.log(`Loaded ${this.flags.size} feature flags`)
    } catch (error) {
      console.error('Failed to load feature flags:', error)
    }
  }

  /**
   * @param {string} flagName
   * @param {string|number|null} [userId] - enables per-user overrides and
   *   percentage rollout when given
   * @returns {boolean} false for unknown or disabled flags; true for users
   *   listed in `userOverrides`; the rollout bucket result when the flag has
   *   a `percentage` and a userId is given; otherwise the flag's `enabled`
   *   value
   */
  isEnabled(flagName, userId = null) {
    const flag = this.flags.get(flagName)

    if (!flag) return false
    if (!flag.enabled) return false

    // Check user-specific overrides
    if (userId && flag.userOverrides) {
      if (flag.userOverrides.includes(userId)) {
        return true
      }
    }

    // Check percentage rollout
    // Tested against null/undefined rather than truthiness: a percentage of
    // 0 means "nobody" and must not fall through to "everybody".
    const hasPercentage = flag.percentage !== undefined && flag.percentage !== null
    if (hasPercentage && userId) {
      const hash = this.hashUserId(userId)
      return (hash % 100) < flag.percentage
    }

    return flag.enabled
  }

  /**
   * Turns a flag off in Redis and in the local cache, and appends an audit
   * entry to the `feature_flag_changes` list.
   *
   * @param {string} flagName
   * @param {string} [reason]
   */
  async disable(flagName, reason = '') {
    // Work on a copy so the cached flag only changes after Redis accepted
    // the write.
    const flag = Object.assign({}, this.flags.get(flagName) || {})
    flag.enabled = false
    flag.disabledAt = new Date().toISOString()
    flag.disabledReason = reason

    await this.redis.hset(
      'feature_flags',
      flagName,
      JSON.stringify(flag)
    )

    this.flags.set(flagName, flag)

    // Log the change
    await this.redis.lpush(
      'feature_flag_changes',
      JSON.stringify({
        flag: flagName,
        action: 'disabled',
        reason: reason,
        timestamp: new Date().toISOString()
      })
    )

    console.log(`Feature flag ${flagName} disabled: ${reason}`)
  }

  /**
   * Deterministic non-negative hash of a user id, used for rollout buckets.
   *
   * @param {string|number} userId
   * @returns {number}
   */
  hashUserId(userId) {
    // Numeric ids have no `.length`; without the conversion they would all
    // hash to 0 and land in the same bucket.
    const text = String(userId)
    let hash = 0
    for (let i = 0; i < text.length; i++) {
      hash = ((hash << 5) - hash) + text.charCodeAt(i)
      hash = hash & hash // keep the value within 32 bits
    }
    return Math.abs(hash)
  }
}

// Emergency disable script
//
// Disables one feature flag, alerts the team on Slack and ends the process:
// exit code 0 once the flag has been disabled, 1 if disabling it failed.
//
// NOTE(review): FeatureFlags awaits hgetall/hset/lpush - confirm that the
// installed `redis` client returns promises from those methods and does not
// need an explicit connect() call first.
async function emergencyDisableFeature(featureName) {
  const redis = require('redis').createClient()
  const flags = new FeatureFlags(redis)

  try {
    // loadFlags() instead of initialize(): a one-shot script needs the
    // current flag values but not the polling timer.
    await flags.loadFlags()
    await flags.disable(featureName, 'Emergency rollback due to production issue')
  } catch (error) {
    console.error(`Failed to disable feature ${featureName}:`, error)
    process.exit(1)
  }

  // Notify team
  // Best-effort: the flag is already off, so a failed alert is reported but
  // does not turn the run into a failure.
  try {
    await sendSlackAlert({
      text: `🚨 Feature ${featureName} has been disabled`,
      channel: '#alerts'
    })
  } catch (error) {
    console.error(`Feature ${featureName} was disabled, but the Slack alert failed:`, error)
  }

  process.exit(0)
}

API Version Rollback

API Versioning Strategy

<?php
// app/middleware/ApiVersionMiddleware.php

/**
 * Routes each request to the controller namespace of its API version and
 * transparently falls back to an older version when the requested one has
 * been disabled through the Redis set `disabled_api_versions`.
 */
class ApiVersionMiddleware
{
    private $supportedVersions = ['v1', 'v2', 'v3'];
    private $deprecatedVersions = ['v1'];
    private $versionOverrides = [];

    /**
     * DI container taken from the dispatcher. This class has no parent that
     * provides getDI(), so the helpers below read services from here.
     */
    private $di;

    /**
     * @param object $dispatcher dispatcher exposing getDI() and setNamespaceName()
     */
    public function beforeExecuteRoute($dispatcher)
    {
        $this->di = $dispatcher->getDI();
        $request = $this->di->get('request');

        // Get requested version
        $version = $this->getRequestedVersion($request);

        // Check for emergency rollback override
        if ($this->isVersionDisabled($version)) {
            $fallbackVersion = $this->getFallbackVersion($version);

            // Log the rollback
            $this->logVersionRollback($version, $fallbackVersion);

            // Redirect to fallback version
            $version = $fallbackVersion;
        }

        // Route to appropriate version
        $dispatcher->setNamespaceName("App\\Controllers\\Api\\{$version}");
    }

    /**
     * Extracts the API version from the request URI, e.g. "/api/v2/customers".
     *
     * Only values listed in $supportedVersions are returned, so arbitrary
     * request input never reaches the controller namespace.
     *
     * NOTE(review): the original class called this method without defining
     * it. Reading the version from the URI path and defaulting to the newest
     * supported version are assumptions - confirm how clients select a
     * version.
     *
     * @param object $request request object exposing getURI()
     * @return string one of $supportedVersions
     */
    private function getRequestedVersion($request)
    {
        $newest = $this->supportedVersions[count($this->supportedVersions) - 1];

        if (preg_match('#/(v\d+)(?:/|\?|$)#', (string) $request->getURI(), $matches)
            && in_array($matches[1], $this->supportedVersions, true)) {
            return $matches[1];
        }

        return $newest;
    }

    /**
     * @param string $version
     * @return bool true when the version is listed in `disabled_api_versions`
     */
    private function isVersionDisabled($version)
    {
        // v1 is the last fallback and cannot be disabled.
        if ($version === 'v1') {
            return false;
        }

        $redis = $this->di->get('redis');
        return (bool) $redis->sismember('disabled_api_versions', $version);
    }

    /**
     * Returns the nearest older version that is not itself disabled.
     *
     * @param string $version the disabled version
     * @return string
     */
    private function getFallbackVersion($version)
    {
        $versionMap = [
            'v3' => 'v2',
            'v2' => 'v1',
            'v1' => 'v1'  // v1 cannot be disabled
        ];

        $candidate = $versionMap[$version] ?? 'v1';

        // Keep stepping down while the candidate is disabled as well; the
        // loop ends at v1 at the latest.
        while ($candidate !== 'v1' && $this->isVersionDisabled($candidate)) {
            $candidate = $versionMap[$candidate] ?? 'v1';
        }

        return $candidate;
    }

    /**
     * @param string $from version the client asked for
     * @param string $to   version that will serve the request
     */
    private function logVersionRollback($from, $to)
    {
        $logger = $this->di->get('logger');
        $logger->warning("API version rollback", [
            'from_version' => $from,
            'to_version' => $to,
            'timestamp' => time(),
            // Not set when running outside a web request (CLI, tests).
            'client_ip' => $_SERVER['REMOTE_ADDR'] ?? null
        ]);
    }
}

Infrastructure Rollback

AWS Infrastructure Rollback

#!/bin/bash
# scripts/infrastructure_rollback.sh
#
# Restores the previous Terraform state and drives a CloudFormation update
# rollback for the crm-production stack.

# Stop at the first failure: in particular, never plan or apply after a
# failed `cd` or a rejected state push.
set -euo pipefail

STACK_NAME="crm-production"

# Terraform state rollback
cd terraform/

# View state history
terraform state list
# Informational only; the resource may not be present in every state.
terraform state show aws_instance.web || true

# Rollback to previous state
terraform state pull > current.tfstate
cp current.tfstate "backup_$(date +%Y%m%d_%H%M%S).tfstate"

# Download previous state from S3
aws s3 cp s3://terraform-state/prod/terraform.tfstate.backup ./previous.tfstate

# Apply previous state
# NOTE(review): `terraform state push` performs safety checks and may refuse
# a state that is older than the remote one. Pushing an old state also does
# not change any infrastructure by itself - confirm this is the intended
# procedure rather than applying the previous configuration.
terraform state push previous.tfstate

# The plan is saved and exactly that plan is applied, so what runs is what
# was printed. Applying a saved plan does not prompt for confirmation.
terraform plan -out=rollback.tfplan
terraform apply rollback.tfplan

# CloudFormation stack rollback
# The two rollback commands are valid in different stack states, so the
# current state decides which one is issued.
STACK_STATUS=$(aws cloudformation describe-stacks \
    --stack-name "$STACK_NAME" \
    --query 'Stacks[0].StackStatus' \
    --output text)

case "$STACK_STATUS" in
    UPDATE_IN_PROGRESS)
        aws cloudformation cancel-update-stack --stack-name "$STACK_NAME"
        ;;
    UPDATE_ROLLBACK_FAILED)
        aws cloudformation continue-update-rollback --stack-name "$STACK_NAME"
        ;;
    UPDATE_ROLLBACK_IN_PROGRESS)
        # A rollback is already running; just wait for it below.
        ;;
    *)
        echo "Stack $STACK_NAME is in state $STACK_STATUS; no CloudFormation rollback to perform"
        exit 0
        ;;
esac

# Wait for rollback to complete
aws cloudformation wait stack-rollback-complete --stack-name "$STACK_NAME"

Load Balancer Rollback

#!/bin/bash
# scripts/lb_rollback.sh
#
# Blue/green switch: points the load balancer's first listener at whichever
# of the two target groups it is not currently forwarding to.

set -euo pipefail

ALB_ARN="arn:aws:elasticloadbalancing:us-east-1:123456789:loadbalancer/app/crm-alb"
TARGET_GROUP_BLUE="arn:aws:elasticloadbalancing:us-east-1:123456789:targetgroup/blue"
TARGET_GROUP_GREEN="arn:aws:elasticloadbalancing:us-east-1:123456789:targetgroup/green"

# Get current target group
# One call returns both the listener ARN and its default target group, so
# the value that is inspected and the listener that is modified always
# belong together.
LISTENER_INFO=$(aws elbv2 describe-listeners \
    --load-balancer-arn "$ALB_ARN" \
    --query 'Listeners[0].[ListenerArn, DefaultActions[0].TargetGroupArn]' \
    --output text)
read -r LISTENER_ARN CURRENT_TG <<< "$LISTENER_INFO"

# Switch to other target group
# Anything other than the two known groups (a failed lookup, "None", a
# different default action) is an error instead of a silent switch to blue.
case "$CURRENT_TG" in
    "$TARGET_GROUP_BLUE")
        NEW_TG=$TARGET_GROUP_GREEN
        ;;
    "$TARGET_GROUP_GREEN")
        NEW_TG=$TARGET_GROUP_BLUE
        ;;
    *)
        echo "Listener forwards to unexpected target group '$CURRENT_TG'; refusing to switch" >&2
        exit 1
        ;;
esac

echo "Switching from $CURRENT_TG to $NEW_TG"

# Update listener
aws elbv2 modify-listener \
    --listener-arn "$LISTENER_ARN" \
    --default-actions "Type=forward,TargetGroupArn=$NEW_TG"

echo "Load balancer rollback completed"

Rollback Verification

Automated Verification Script

#!/bin/bash
# scripts/verify_rollback.sh

CHECKS_PASSED=0
CHECKS_FAILED=0

# Run one verification command and record the outcome.
#
# Globals:   CHECKS_PASSED, CHECKS_FAILED (incremented)
# Arguments: $1 - label printed for the check
#            $2 - shell command line; exit status 0 means the check passed.
#                 It is evaluated with `eval`, so only pass trusted strings.
# Outputs:   "Checking <label>... ✓ PASSED" or "... ✗ FAILED" on stdout
# Returns:   0 in both cases
run_check() {
    local check_name=$1
    local check_command=$2

    printf '%s' "Checking $check_name... "

    # Quoted, so the command string reaches eval exactly as written; unquoted
    # it would first be word-split and glob-expanded against the current
    # directory.
    if eval "$check_command"; then
        echo "✓ PASSED"
        CHECKS_PASSED=$((CHECKS_PASSED + 1))
    else
        echo "✗ FAILED"
        CHECKS_FAILED=$((CHECKS_FAILED + 1))
    fi
}

# Each check below is a small function that returns 0 on success. run_check
# receives only the function name, so nothing inside a check ($(...), awk's
# $2, credentials) is expanded early while the argument string is built.

# Print the single value returned by one SQL statement run against $DB_NAME
# (-N drops the column header, -s the table borders).
db_query() {
    mysql -h "$DB_HOST" -u "$DB_USER" -p"$DB_PASS" "$DB_NAME" -N -s -e "$1"
}

check_database_connectivity() {
    mysql -h "$DB_HOST" -u "$DB_USER" -p"$DB_PASS" -e 'SELECT 1' > /dev/null 2>&1
}

check_redis_connectivity() {
    redis-cli -h "$REDIS_HOST" ping | grep -q PONG
}

# Passes when the health endpoint answers in under 2 seconds. curl reports
# the time as a decimal number, which `[ -lt ]` cannot compare, so awk does
# the comparison.
check_api_response_time() {
    local seconds
    seconds=$(curl -w '%{time_total}' -s -o /dev/null https://api.ourcrm.com/health) || return 1
    awk -v t="$seconds" 'BEGIN { exit !(t + 0 < 2) }'
}

# Passes when at least one customer row was created within the last hour.
check_recent_customers() {
    local count
    count=$(db_query 'SELECT COUNT(*) FROM customers WHERE created_at > DATE_SUB(NOW(), INTERVAL 1 HOUR)') || return 1
    [ "$count" -gt 0 ]
}

# Passes when no order references a customer id that does not exist.
check_no_orphaned_orders() {
    local count
    count=$(db_query 'SELECT COUNT(*) FROM orders WHERE customer_id NOT IN (SELECT id FROM customers)') || return 1
    [ "$count" -eq 0 ]
}

check_critical_feature_enabled() {
    redis-cli -h "$REDIS_HOST" hget feature_flags critical_feature | jq -r .enabled | grep -q true
}

# Passes when the second field of top's "Cpu(s)" line is below 80. "+ 0"
# makes awk read the leading number of the field and ignore any suffix.
check_cpu_usage() {
    top -bn1 | awk '/Cpu\(s\)/ { ok = ($2 + 0 < 80) } END { exit !ok }'
}

# Passes when column 3 is below 80% of column 2 on the second line of
# `free -m`.
check_memory_usage() {
    free -m | awk 'NR == 2 { ok = ($3 * 100 / $2 < 80) } END { exit !ok }'
}

# Passes when the usage percentage reported for / is below 80. -P keeps the
# entry on a single line.
check_disk_space() {
    df -P / | awk 'NR == 2 { sub(/%/, "", $5); ok = ($5 + 0 < 80) } END { exit !ok }'
}

# Application checks
run_check "Application Health" \
    "curl -s https://api.ourcrm.com/health | jq -r .status | grep -q healthy"

run_check "Database Connectivity" check_database_connectivity

run_check "Redis Connectivity" check_redis_connectivity

run_check "API Response Time" check_api_response_time

# Data integrity checks
run_check "Customer Data Integrity" check_recent_customers

run_check "No Orphaned Records" check_no_orphaned_orders

# Feature checks
run_check "Critical Features Enabled" check_critical_feature_enabled

# Performance checks
run_check "CPU Usage Normal" check_cpu_usage

run_check "Memory Usage Normal" check_memory_usage

run_check "Disk Space Available" check_disk_space

# Summary
echo "========================================="
echo "Rollback Verification Complete"
echo "Checks Passed: $CHECKS_PASSED"
echo "Checks Failed: $CHECKS_FAILED"
echo "========================================="

if [ "$CHECKS_FAILED" -gt 0 ]; then
    echo "⚠️  WARNING: Some checks failed. Manual intervention may be required."
    exit 1
else
    echo "✅ All checks passed. Rollback successful!"
    exit 0
fi

Post-Rollback Procedures

Incident Report Template

# Rollback Incident Report

**Date**: [Date]
**Time**: [Start Time] - [End Time]
**Severity**: P0/P1/P2
**Systems Affected**: [List of systems]

## Summary
Brief description of what happened and why rollback was necessary.

## Timeline
- **[Time]**: Issue detected
- **[Time]**: Decision to rollback made
- **[Time]**: Rollback initiated
- **[Time]**: Rollback completed
- **[Time]**: Service restored

## Impact
- **Users Affected**: [Number/%]
- **Duration**: [Total downtime]
- **Data Loss**: Yes/No
- **Revenue Impact**: [Estimate]

## Root Cause
Description of what caused the issue.

## Rollback Actions
1. [Action taken]
2. [Action taken]
3. [Action taken]

## Lessons Learned
- What went well
- What could be improved
- Action items for prevention

## Follow-up Actions
- [ ] Fix root cause
- [ ] Update rollback procedures
- [ ] Improve monitoring
- [ ] Team training

**Report Prepared By**: [Name]
**Reviewed By**: [Name]

Last Updated: January 2025 | Version: 1.0.0