TL;DR

Copy and adapt these scripts for common SRE tasks: HTTP health checks with retry, Kubernetes pod/resource reporting, log pattern scanning, and Slack/webhook notifications. Each script follows the template from the Automation Basics page.

HTTP Health Check with Retry

Poll an endpoint until it returns the expected status code, with a configurable number of attempts, a fixed delay between attempts, and a per-request timeout — useful during deployments, rollouts, and smoke tests.

health_check.py (Python)
#!/usr/bin/env python3
"""Poll an HTTP endpoint until healthy or timeout. Exit 0 on success, 1 on failure."""

import argparse
import logging
import sys
import time

import requests

# Log to stderr at INFO level, prefixing each record with a timestamp and level.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format="%(asctime)s %(levelname)s — %(message)s",
)
log = logging.getLogger(__name__)


def wait_healthy(url: str, expected: int, attempts: int, delay: float, timeout: float) -> bool:
    """Poll *url* with GET until it answers with the *expected* status code.

    Args:
        url: Endpoint to request.
        expected: HTTP status code that counts as healthy.
        attempts: Maximum number of requests to make.
        delay: Seconds to sleep between attempts (no sleep after the last one).
        timeout: Per-request timeout in seconds, passed to ``session.get``.

    Returns:
        True as soon as one response carries the expected status code; False
        once every attempt has either raised ``requests.RequestException`` or
        returned a different code.
    """
    # The context manager closes the session (and its pooled connections) on
    # every exit path; a bare Session() would be left open.
    with requests.Session() as session:
        for attempt in range(1, attempts + 1):
            try:
                r = session.get(url, timeout=timeout)
            except requests.RequestException as e:
                log.warning("Attempt %d/%d: connection error — %s", attempt, attempts, e)
            else:
                if r.status_code == expected:
                    log.info("Healthy: %s → %d (attempt %d)", url, r.status_code, attempt)
                    return True
                log.warning("Attempt %d/%d: got %d, expected %d", attempt, attempts, r.status_code, expected)
            # Only wait when another attempt will follow.
            if attempt < attempts:
                time.sleep(delay)
    return False


def main() -> int:
    """Parse the command line, run the health poll, and return the exit status.

    Returns:
        0 when the endpoint reported the expected status, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("url", help="Endpoint to poll, e.g. https://myapp/healthz")
    # Optional tuning flags: (name, value type, default).
    for flag, kind, default in (
        ("--expected", int, 200),
        ("--attempts", int, 10),
        ("--delay", float, 5.0),
        ("--timeout", float, 10.0),
    ):
        parser.add_argument(flag, type=kind, default=default)
    opts = parser.parse_args()

    healthy = wait_healthy(opts.url, opts.expected, opts.attempts, opts.delay, opts.timeout)
    if healthy:
        return 0
    return 1


if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)

Kubernetes Resource Report

Generate a per-pod table, ordered by namespace, of pod phase and restart counts, with unhealthy pods flagged — pipe it to a file or Slack during capacity reviews or hand-offs.

k8s_report.py (Python)
#!/usr/bin/env python3
"""Print a pod health summary using the Kubernetes Python client.

Install: pip install kubernetes
"""

import sys
from kubernetes import client, config


def pod_restarts(pod) -> int:
    """Return the summed restart count of every container status on *pod*.

    A pod whose ``status.container_statuses`` is ``None`` or empty yields 0.
    """
    statuses = pod.status.container_statuses
    if not statuses:
        return 0
    total = 0
    for container in statuses:
        total += container.restart_count
    return total


def pod_phase(pod) -> str:
    """Return a short status label for *pod*.

    The first container status that is in a waiting state wins: its waiting
    reason is returned, or ``"Waiting"`` when the reason is empty. If no
    container is waiting, the pod-level ``status.phase`` is returned, or
    ``"Unknown"`` when that is empty too.
    """
    for cs in (pod.status.container_statuses or []):
        state = cs.state
        # `state` is an optional field on a container status; skip entries
        # without one instead of failing on `None.waiting`.
        if state is not None and state.waiting:
            return state.waiting.reason or "Waiting"
    return pod.status.phase or "Unknown"


def main():
    """Print a table of pods with their phase label and restart count.

    An optional namespace is read from the first command-line argument; when
    it is absent or empty, pods from all namespaces are listed. Rows are
    ordered by namespace, then phase label, then restart count (highest
    first). A trailing " !" marks pods with more than 5 restarts or a phase
    label other than Running/Succeeded.
    """
    config.load_kube_config()   # use in-cluster: config.load_incluster_config()
    api = client.CoreV1Api()

    cli_args = sys.argv[1:]
    namespace = cli_args[0] if cli_args else None
    if not namespace:
        listing = api.list_pod_for_all_namespaces()
    else:
        listing = api.list_namespaced_pod(namespace)
    pods = listing.items

    def sort_key(p):
        # Negated restarts put the most-restarted pods first within a phase.
        return (p.metadata.namespace, pod_phase(p), -pod_restarts(p))

    healthy_phases = ("Running", "Succeeded")

    print(f"{'NAMESPACE':<20} {'NAME':<45} {'PHASE':<20} {'RESTARTS':>8}")
    print("-" * 95)
    for pod in sorted(pods, key=sort_key):
        meta = pod.metadata
        ns = meta.namespace
        name = meta.name
        phase = pod_phase(pod)
        restarts = pod_restarts(pod)
        if restarts > 5 or phase not in healthy_phases:
            flag = " !"
        else:
            flag = ""
        print(f"{ns:<20} {name:<45} {phase:<20} {restarts:>8}{flag}")


if __name__ == "__main__":
    main()

Log Error Scanner

Scan log files or stdin for error patterns, count occurrences, and output a ranked summary — useful for post-incident log analysis across multiple pods.

log_scan.py (Python)
#!/usr/bin/env python3
"""Scan log lines for error patterns and rank by frequency.

Usage:
  kubectl logs -n prod deploy/myapp | python log_scan.py
  python log_scan.py --file app.log --pattern "ERROR|FATAL|Exception"
"""

import argparse
import re
import sys
from collections import Counter


# Regex used when --pattern is not given: an alternation of error keywords.
# scan() compiles patterns with re.IGNORECASE, so letter case does not matter.
DEFAULT_PATTERN = r"(ERROR|FATAL|Exception|panic|OOMKill|timeout|refused)"


def scan(lines, pattern: str, context: int = 0) -> Counter:
    """Count case-insensitive regex hits across *lines*.

    Each line contributes at most one hit: the first match ``search`` finds.
    Hits are keyed by the lower-cased matched text, so "ERROR" and "error"
    are tallied together.

    Args:
        lines: Iterable of strings to scan.
        pattern: Regular expression to look for.
        context: Accepted but not used by this function.

    Returns:
        Counter mapping each lower-cased matched text to the number of lines
        on which it was the first match.
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    hits = (matcher.search(text) for text in lines)
    return Counter(hit.group(0).lower() for hit in hits if hit)


def main():
    """Scan a log file or stdin and print the most frequent error matches.

    Command-line options:
        --file: Log file to scan; stdin is read when omitted.
        --pattern: Regex to match (defaults to DEFAULT_PATTERN).
        --top: Number of distinct matches to print (default 20).

    Prints "No matches found." when nothing matched, otherwise a table of
    counts in descending order.
    """
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--file", help="Log file to scan (default: stdin)")
    p.add_argument("--pattern", default=DEFAULT_PATTERN, help="Regex to match errors")
    p.add_argument("--top", type=int, default=20, help="Show top N patterns")
    args = p.parse_args()

    if args.file:
        # `with` closes the file even when scan() raises (for example on an
        # invalid --pattern), which a manual close() after the call would miss.
        with open(args.file) as src:
            counts = scan(src, args.pattern)
    else:
        counts = scan(sys.stdin, args.pattern)

    if not counts:
        print("No matches found.")
        return

    print(f"\n{'COUNT':>8}  PATTERN")
    print("-" * 40)
    for pattern, count in counts.most_common(args.top):
        print(f"{count:>8}  {pattern}")


if __name__ == "__main__":
    main()

Slack / Webhook Notifications

Send structured alerts to Slack or any webhook endpoint; keep the payload simple and include context (cluster, namespace, severity) so on-call engineers can act without opening another tab.

notify.py (Python)
#!/usr/bin/env python3
"""Send a structured Slack notification via Incoming Webhook.

Set SLACK_WEBHOOK_URL in the environment.
"""

import json
import logging
import os
import sys
import urllib.request

# Module-level logger named after this module; no handlers are configured here.
log = logging.getLogger(__name__)


def slack_notify(
    message: str,
    title: str = "SRE Alert",
    color: str = "danger",  # good | warning | danger
    fields: list[dict] | None = None,
) -> bool:
    """Send one attachment-style message to the configured webhook.

    The webhook URL is read from the ``SLACK_WEBHOOK_URL`` environment
    variable on every call.

    Args:
        message: Attachment body text.
        title: Attachment title.
        color: Attachment colour (good | warning | danger).
        fields: Optional list of field dicts; an empty list is sent when
            omitted or empty.

    Returns:
        True only when the webhook answers with HTTP 200. False when the URL
        is not configured or when sending raises; both cases are logged.
    """
    webhook_url = os.environ.get("SLACK_WEBHOOK_URL")
    if not webhook_url:
        log.error("SLACK_WEBHOOK_URL not set")
        return False

    attachment = {
        "color": color,
        "title": title,
        "text": message,
        "fields": fields or [],
        "footer": "SRE Automation",
    }
    body = json.dumps({"attachments": [attachment]}).encode()
    # Supplying `data` makes urllib send the request as a POST.
    request = urllib.request.Request(
        webhook_url,
        data=body,
        headers={"Content-Type": "application/json"},
    )

    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            delivered = response.status == 200
    except Exception as exc:
        log.error("Slack notification failed: %s", exc)
        return False
    return delivered


# Example call: post a sample incident alert and exit 0 on success, 1 on failure.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, stream=sys.stderr)
    cluster = os.environ.get("CLUSTER_NAME", "unknown")
    # (title, value) pairs; each becomes a short field on the attachment.
    details = [
        ("Namespace", "prod"),
        ("Restarts", "17"),
        ("Last Exit", "OOMKilled"),
    ]
    sent = slack_notify(
        message="Pod `my-api-5f9b4d` in `prod` is in CrashLoopBackOff.",
        title=f":rotating_light: Incident Alert — {cluster}",
        color="danger",
        fields=[{"title": label, "value": value, "short": True} for label, value in details],
    )
    if sent:
        sys.exit(0)
    sys.exit(1)