Python SRE Scripts
Copy and adapt these scripts for common SRE tasks: HTTP health checks with retry, Kubernetes pod/resource reporting, log pattern scanning, and Slack/webhook notifications. Each script follows the template from the Automation Basics page.
HTTP Health Check with Retry
Poll an endpoint until it returns the expected status code, with a configurable number of attempts, a fixed delay between attempts, and a per-request timeout — useful during deployments, rollouts, and smoke tests.
#!/usr/bin/env python3
"""Poll an HTTP endpoint until healthy or timeout. Exit 0 on success, 1 on failure."""
import argparse
import logging
import sys
import time
import requests
# Emit INFO and above to stderr with a timestamped "<time> <level> — <message>" format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s — %(message)s", stream=sys.stderr)
# Module-level logger used by the functions below.
log = logging.getLogger(__name__)
def wait_healthy(url: str, expected: int, attempts: int, delay: float, timeout: float) -> bool:
    """Poll *url* with GET until it answers with the *expected* status code.

    Args:
        url: Endpoint to request.
        expected: HTTP status code that counts as healthy.
        attempts: Maximum number of requests to make.
        delay: Seconds to sleep between attempts (not after the last one).
        timeout: Per-request timeout in seconds, passed to ``Session.get``.

    Returns:
        True as soon as one response carries the expected status code;
        False once every attempt has been used (or if *attempts* < 1).
    """
    # The context manager closes the session (and its pooled connections)
    # on every exit path, including the early ``return True``.
    with requests.Session() as session:
        for attempt in range(1, attempts + 1):
            try:
                r = session.get(url, timeout=timeout)
            except requests.RequestException as e:
                log.warning("Attempt %d/%d: connection error — %s", attempt, attempts, e)
            else:
                if r.status_code == expected:
                    log.info("Healthy: %s → %d (attempt %d)", url, r.status_code, attempt)
                    return True
                log.warning("Attempt %d/%d: got %d, expected %d", attempt, attempts, r.status_code, expected)
            # Only wait when another attempt will follow.
            if attempt < attempts:
                time.sleep(delay)
    return False
def main() -> int:
    """Parse the command line, poll the endpoint, and return the exit code.

    Returns:
        0 when the endpoint became healthy, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("url", help="Endpoint to poll, e.g. https://myapp/healthz")
    # Optional tuning knobs: (flag, value type, default).
    options = (
        ("--expected", int, 200),
        ("--attempts", int, 10),
        ("--delay", float, 5.0),
        ("--timeout", float, 10.0),
    )
    for flag, value_type, default in options:
        parser.add_argument(flag, type=value_type, default=default)
    opts = parser.parse_args()

    healthy = wait_healthy(
        opts.url,
        opts.expected,
        opts.attempts,
        opts.delay,
        opts.timeout,
    )
    if healthy:
        return 0
    return 1
if __name__ == "__main__":
    sys.exit(main())
Kubernetes Resource Report
Generate a per-namespace summary of pod health, restart counts, and resource requests — pipe to a file or Slack during capacity reviews or hand-offs.
#!/usr/bin/env python3
"""Print a pod health summary using the Kubernetes Python client.
Install: pip install kubernetes
"""
import sys
from kubernetes import client, config
def pod_restarts(pod) -> int:
    """Return the total restart count across the pod's container statuses.

    A pod whose ``status.container_statuses`` is empty or None counts as 0.
    """
    statuses = pod.status.container_statuses or []
    total = 0
    for status in statuses:
        total += status.restart_count
    return total
def pod_phase(pod) -> str:
    """Return a display phase for the pod.

    If any container status is in a waiting state, that state's reason is
    returned (or "Waiting" when the reason is empty). Otherwise the pod's
    ``status.phase`` is returned, or "Unknown" when it is empty.
    """
    for cs in (pod.status.container_statuses or []):
        # ``state`` is an optional field on the container status; a status
        # without one must not abort the whole report.
        waiting = cs.state.waiting if cs.state is not None else None
        if waiting:
            return waiting.reason or "Waiting"
    return pod.status.phase or "Unknown"
def main():
    """Print a table of pods with namespace, name, phase, and restart count.

    The optional first command-line argument selects a single namespace;
    without it, pods from all namespaces are listed. Rows are ordered by
    namespace, then phase, then restart count (highest first). A trailing
    " !" marks pods with more than 5 restarts or a phase other than
    "Running"/"Succeeded".
    """
    config.load_kube_config()  # use in-cluster: config.load_incluster_config()
    api = client.CoreV1Api()

    target_ns = sys.argv[1] if len(sys.argv) > 1 else None
    listing = api.list_namespaced_pod(target_ns) if target_ns else api.list_pod_for_all_namespaces()
    pods = listing.items

    print(f"{'NAMESPACE':<20} {'NAME':<45} {'PHASE':<20} {'RESTARTS':>8}")
    print("-" * 95)

    # Compute each pod's display values once, then order the rows.
    rows = [
        (pod.metadata.namespace, pod_phase(pod), pod_restarts(pod), pod.metadata.name)
        for pod in pods
    ]
    rows.sort(key=lambda row: (row[0], row[1], -row[2]))

    for ns, phase, restarts, name in rows:
        healthy = restarts <= 5 and phase in ("Running", "Succeeded")
        flag = "" if healthy else " !"
        print(f"{ns:<20} {name:<45} {phase:<20} {restarts:>8}{flag}")
if __name__ == "__main__":
    main()
Log Error Scanner
Scan log files or stdin for error patterns, count occurrences, and output a ranked summary — useful for post-incident log analysis across multiple pods.
#!/usr/bin/env python3
"""Scan log lines for error patterns and rank by frequency.
Usage:
kubectl logs -n prod deploy/myapp | python log_scan.py
python log_scan.py --file app.log --pattern "ERROR|FATAL|Exception"
"""
import argparse
import re
import sys
from collections import Counter
# Default regex used when --pattern is not given: an alternation of common
# failure keywords. scan() compiles it with re.IGNORECASE.
DEFAULT_PATTERN = r"(ERROR|FATAL|Exception|panic|OOMKill|timeout|refused)"
def scan(lines, pattern: str, context: int = 0) -> Counter:
    """Count the first case-insensitive match of *pattern* on each line.

    Args:
        lines: Iterable of text lines (a file object, stdin, a list, ...).
        pattern: Regular expression to search for.
        context: Accepted for interface compatibility; currently unused.

    Returns:
        A Counter keyed by the lower-cased matched text, so differently
        cased occurrences of the same keyword are grouped together.
    """
    matcher = re.compile(pattern, re.IGNORECASE)
    hits = (matcher.search(text) for text in lines)
    return Counter(hit.group(0).lower() for hit in hits if hit)
def main():
    """Parse arguments, scan the chosen input, and print a ranked summary.

    Reads from ``--file`` when given, otherwise from stdin. Prints
    "No matches found." when nothing matched; otherwise prints the
    ``--top`` most frequent matched patterns with their counts.
    """
    p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--file", help="Log file to scan (default: stdin)")
    p.add_argument("--pattern", default=DEFAULT_PATTERN, help="Regex to match errors")
    p.add_argument("--top", type=int, default=20, help="Show top N patterns")
    args = p.parse_args()

    if args.file:
        # ``with`` closes the file even if scanning raises (e.g. on an
        # invalid --pattern). Logs often contain stray non-UTF-8 bytes;
        # replace them rather than aborting the scan.
        with open(args.file, encoding="utf-8", errors="replace") as src:
            counts = scan(src, args.pattern)
    else:
        counts = scan(sys.stdin, args.pattern)

    if not counts:
        print("No matches found.")
        return

    print(f"\n{'COUNT':>8} PATTERN")
    print("-" * 40)
    for pattern, count in counts.most_common(args.top):
        print(f"{count:>8} {pattern}")
if __name__ == "__main__":
    main()
Slack / Webhook Notifications
Send structured alerts to Slack or any webhook endpoint; keep the payload simple and include context (cluster, namespace, severity) so on-call engineers can act without opening another tab.
#!/usr/bin/env python3
"""Send a structured Slack notification via Incoming Webhook.
Set SLACK_WEBHOOK_URL in the environment.
"""
import json
import logging
import os
import sys
import urllib.request
log = logging.getLogger(__name__)
def slack_notify(
message: str,
title: str = "SRE Alert",
color: str = "danger", # good | warning | danger
fields: list[dict] | None = None,
) -> bool:
"""Post to Slack. Returns True on success."""
webhook_url = os.environ.get("SLACK_WEBHOOK_URL")
if not webhook_url:
log.error("SLACK_WEBHOOK_URL not set")
return False
payload = {
"attachments": [{
"color": color,
"title": title,
"text": message,
"fields": fields or [],
"footer": "SRE Automation",
}]
}
data = json.dumps(payload).encode()
req = urllib.request.Request(
webhook_url, data=data, headers={"Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=10) as resp:
return resp.status == 200
except Exception as exc:
log.error("Slack notification failed: %s", exc)
return False
# Example call: send a sample incident alert and exit 0 on success, 1 on failure.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, stream=sys.stderr)
    cluster = os.environ.get("CLUSTER_NAME", "unknown")

    # (title, value) pairs rendered as short attachment fields.
    details = [
        ("Namespace", "prod"),
        ("Restarts", "17"),
        ("Last Exit", "OOMKilled"),
    ]
    alert_fields = [
        {"title": label, "value": value, "short": True}
        for label, value in details
    ]

    delivered = slack_notify(
        message="Pod `my-api-5f9b4d` in `prod` is in CrashLoopBackOff.",
        title=f":rotating_light: Incident Alert — {cluster}",
        color="danger",
        fields=alert_fields,
    )
    exit_code = 0 if delivered else 1
    sys.exit(exit_code)