EKS Production Deployment Guide
This guide covers deploying a production-grade EKS cluster using DevOpsGenie's Terraform modules.
Cluster Configuration
Terraform Module Reference
terraform/environments/production/main.tf
# Production EKS cluster: control plane, two managed node groups (system and
# workloads), and the core EKS add-ons.
module "devopsgenie_cluster" {
  source  = "devopsgenie/eks/aws"
  version = "~> 2.4"

  cluster_name    = "my-platform-production"
  cluster_version = "1.29"
  region          = "us-east-1"

  # Nodes and control-plane ENIs live in the private subnets of the VPC module.
  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  # Control plane logging: all five EKS control-plane log types are enabled.
  cluster_enabled_log_types = [
    "api", "audit", "authenticator", "controllerManager", "scheduler"
  ]

  # Cluster endpoint access
  # Private-only: in-VPC clients and connected internal networks (VPN, Direct
  # Connect, peering) reach the API server through the private endpoint.
  # EKS rejects private/reserved ranges such as 10.0.0.0/8 in the public access
  # CIDR list, so internal networks cannot be allow-listed on the public
  # endpoint.
  cluster_endpoint_private_access = true
  cluster_endpoint_public_access  = false
  # If public access is required, set cluster_endpoint_public_access = true and
  # list only your public egress CIDRs in cluster_endpoint_public_access_cidrs.

  # Managed node groups
  eks_managed_node_groups = {
    # Dedicated nodes for cluster-critical add-ons.
    system = {
      name           = "system-nodes"
      instance_types = ["m6i.xlarge"]

      min_size     = 2
      max_size     = 5
      desired_size = 3

      labels = {
        role = "system"
      }

      # Only pods that tolerate CriticalAddonsOnly are scheduled here.
      taints = [{
        key    = "CriticalAddonsOnly"
        value  = "true"
        effect = "NO_SCHEDULE"
      }]

      # Limit how many nodes may be unavailable at once during a node group update.
      update_config = {
        max_unavailable_percentage = 33
      }
    }

    # General-purpose application nodes.
    workloads = {
      name           = "workload-nodes"
      instance_types = ["m6i.2xlarge", "m6i.4xlarge", "m6a.2xlarge"]

      min_size     = 3
      max_size     = 20
      desired_size = 5

      labels = {
        role = "workloads"
      }

      # Encrypted 100 GiB gp3 root volume, removed when the instance terminates.
      block_device_mappings = {
        xvda = {
          device_name = "/dev/xvda"
          ebs = {
            volume_size           = 100
            volume_type           = "gp3"
            iops                  = 3000
            throughput            = 125
            delete_on_termination = true
            encrypted             = true
          }
        }
      }
    }
  }

  # EKS add-ons
  cluster_addons = {
    coredns = {
      most_recent = true
      # Two replicas with explicit CPU/memory requests and limits.
      configuration_values = jsonencode({
        replicaCount = 2
        resources = {
          limits   = { cpu = "200m", memory = "200Mi" }
          requests = { cpu = "100m", memory = "100Mi" }
        }
      })
    }

    kube-proxy = {
      most_recent = true
    }

    vpc-cni = {
      most_recent = true
      # Prefix delegation, keeping one warm prefix per node.
      configuration_values = jsonencode({
        env = {
          ENABLE_PREFIX_DELEGATION = "true"
          WARM_PREFIX_TARGET       = "1"
        }
      })
    }

    aws-ebs-csi-driver = {
      most_recent = true
      # IAM role for the driver's service account, created by the
      # ebs_csi_irsa_role module.
      service_account_role_arn = module.ebs_csi_irsa_role.iam_role_arn
    }
  }

  tags = {
    Environment = "production"
    ManagedBy   = "devopsgenie"
    Team        = "platform"
  }
}
VPC Configuration
terraform/environments/production/vpc.tf
# Network for the production cluster: a /16 VPC spanning three availability
# zones, with one private and one public subnet in each.
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 5.0"

  name = "my-platform-production"
  cidr = "10.100.0.0/16"
  azs  = ["us-east-1a", "us-east-1b", "us-east-1c"]

  # DNS resolution and hostnames inside the VPC.
  enable_dns_support   = true
  enable_dns_hostnames = true

  # Outbound internet access for the private subnets.
  enable_nat_gateway = true
  single_nat_gateway = false # one NAT per AZ for HA

  # Public tier (/24 per AZ), tagged for EKS subnet discovery.
  public_subnets = ["10.100.48.0/24", "10.100.49.0/24", "10.100.50.0/24"]
  public_subnet_tags = {
    "kubernetes.io/cluster/my-platform-production" = "shared"
    "kubernetes.io/role/elb"                       = "1"
  }

  # Private tier (/20 per AZ), tagged for EKS subnet discovery.
  private_subnets = ["10.100.0.0/20", "10.100.16.0/20", "10.100.32.0/20"]
  private_subnet_tags = {
    "kubernetes.io/cluster/my-platform-production" = "shared"
    "kubernetes.io/role/internal-elb"              = "1"
  }
}
Karpenter Configuration
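Karpenter manages the node lifecycle for workload nodes and replaces Cluster Autoscaler: it provisions right-sized nodes in response to unschedulable pods and consolidates or expires nodes that are no longer needed.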
kubernetes/karpenter/nodepool.yaml
# NodePool: which nodes Karpenter may launch for workloads, how much capacity
# it may create in total, and when it may disrupt nodes.
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: default
spec:
  template:
    metadata:
      labels:
        role: workloads
    spec:
      # Refers to the EC2NodeClass defined below.
      nodeClassRef:
        name: default
      requirements:
        # Allow both on-demand and spot capacity.
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["on-demand", "spot"]
        - key: node.kubernetes.io/instance-type
          operator: In
          values:
            - m6i.xlarge
            - m6i.2xlarge
            - m6i.4xlarge
            - m6a.xlarge
            - m6a.2xlarge
            - m6a.4xlarge
        - key: topology.kubernetes.io/zone
          operator: In
          values: ["us-east-1a", "us-east-1b", "us-east-1c"]
  # Upper bound on the total resources this NodePool may provision.
  limits:
    cpu: "200"
    memory: 400Gi
  disruption:
    # consolidateAfter is intentionally omitted: in karpenter.sh/v1beta1 it may
    # only be combined with consolidationPolicy: WhenEmpty, and the API rejects
    # it together with WhenUnderutilized.
    consolidationPolicy: WhenUnderutilized
    expireAfter: 720h # rotate nodes every 30 days
---
# EC2NodeClass: AWS-specific launch settings (AMI family, node IAM role,
# subnets, security groups, root volume) for nodes created by the NodePool.
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
  name: default
spec:
  amiFamily: AL2
  role: "KarpenterNodeRole-my-platform-production"
  # Selects subnets carrying both tags (the private subnets in the VPC config).
  subnetSelectorTerms:
    - tags:
        kubernetes.io/cluster/my-platform-production: shared
        kubernetes.io/role/internal-elb: "1"
  securityGroupSelectorTerms:
    - tags:
        kubernetes.io/cluster/my-platform-production: owned
  # Root volume, matching the managed workload node group's settings.
  blockDeviceMappings:
    - deviceName: /dev/xvda
      ebs:
        volumeSize: 100Gi
        volumeType: gp3
        iops: 3000
        throughput: 125
        deleteOnTermination: true
        encrypted: true
IRSA Configuration
All platform components use IAM Roles for Service Accounts (IRSA) for least-privilege AWS access:
terraform/environments/production/irsa.tf
# IAM role assumed by the EBS CSI driver's controller service account through
# the cluster's OIDC provider (IRSA).
module "ebs_csi_irsa_role" {
  source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
  # Pinned like the other modules in this guide. This submodule path belongs to
  # the 5.x line; without a constraint, `terraform init` can resolve a newer
  # major version in which the submodule is renamed.
  version = "~> 5.0"

  role_name             = "ebs-csi-driver-${local.cluster_name}"
  attach_ebs_csi_policy = true

  # Trust only the ebs-csi-controller-sa service account in kube-system.
  oidc_providers = {
    ex = {
      provider_arn               = module.devopsgenie_cluster.oidc_provider_arn
      namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]
    }
  }
}
Post-Deployment Verification
# 1. Every node should report STATUS=Ready
kubectl get nodes --output wide

# 2. System and Karpenter pods should all be Running
kubectl get pods --namespace kube-system
kubectl get pods --namespace karpenter

# 3. Exercise Karpenter with an "inflate" deployment: five pause pods,
#    each requesting one full CPU
kubectl apply --filename - <<EOF
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inflate
spec:
  replicas: 5
  selector:
    matchLabels:
      app: inflate
  template:
    metadata:
      labels:
        app: inflate
    spec:
      containers:
        - name: inflate
          image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
          resources:
            requests:
              cpu: "1"
EOF

# 4. Stream node changes while Karpenter brings up capacity (Ctrl-C to stop)
kubectl get nodes --watch
Tip: Run `devopsgenie cluster health` for a complete health check across all cluster components.