# Inspect resources (replace every <...> placeholder before running)
kubectl get pods -n <namespace>
kubectl describe pod <pod-name>
kubectl logs -f <pod-name> -c <container>
kubectl logs --tail=100 <pod-name>
kubectl logs --previous <pod-name> # logs of the previous container instance (e.g. before a restart)
# Open a shell inside a container
kubectl exec -it <pod-name> -- /bin/sh
# Resource usage (kubectl top needs Metrics Server running in the cluster)
kubectl top pods -n <namespace>
kubectl top nodes
# Events
kubectl get events --sort-by='.lastTimestamp'
kubectl get events -w # watch events live
# Export a resource as YAML
kubectl get deploy myapp -o yaml > backup.yaml
| 状态 | 可能原因 | 排查命令 |
|---|---|---|
| Pending | 资源不足/调度失败 | kubectl describe pod 看 Events |
| CrashLoopBackOff | 应用启动失败 | kubectl logs 看错误日志 |
| ImagePullBackOff | 镜像拉取失败 | 检查镜像名/仓库认证 |
| Evicted | 节点资源压力 | 检查节点资源, 调整 limits |
| Terminating | 卡住无法删除 | 先排查 finalizers/节点状态; 仍无法删除再执行 `kubectl delete pod <pod-name> --force --grace-period=0` |
# Drain a node (rolling upgrade / maintenance): skips DaemonSet-managed pods
# and allows eviction of pods that use emptyDir volumes (their data is deleted).
kubectl drain node-1 --ignore-daemonsets --delete-emptydir-data
# Bring the node back into scheduling
kubectl uncordon node-1
# Mark the node unschedulable (debugging)
kubectl cordon node-1
# Node conditions: print the whole Conditions table. A fixed `grep -A5` stops
# after the two header rows plus three conditions, which cuts off the final
# Ready row on a typical node. The range ends at the "Addresses:" heading that
# follows the table in `kubectl describe node` output.
kubectl describe node node-1 | sed -n '/^Conditions:/,/^Addresses:/p'
# ResourceQuota — per-namespace quota.
# Caps the summed CPU/memory requests and limits, and the number of
# PersistentVolumeClaims, in the namespace this object is created in.
apiVersion: v1
kind: ResourceQuota
metadata:
  name: compute-quota
spec:
  hard:
    requests.cpu: "20"
    requests.memory: "40Gi"
    limits.cpu: "40"
    limits.memory: "80Gi"
    persistentvolumeclaims: "10"
# LimitRange — default resource settings for containers.
# `default` supplies limits and `defaultRequest` supplies requests for
# containers that do not declare their own.
apiVersion: v1
kind: LimitRange
metadata:
  name: defaults
spec:
  limits:
    - default:
        cpu: "500m"
        memory: "512Mi"
      defaultRequest:
        cpu: "100m"
        memory: "128Mi"
      type: Container
# Rolling update of a Deployment
kubectl set image deployment/myapp app=myapp:v2
kubectl rollout status deployment/myapp
kubectl rollout history deployment/myapp
kubectl rollout undo deployment/myapp # roll back to the previous revision
# Canary release (manual)
# 1. Create a canary deployment (myapp-canary)
# 2. Shift a small share of traffic to the canary
# 3. Monitor / verify
# 4. Fully upgrade the original deployment