flowchart TD
%% Account-level workspaces (cloud-infra-account-terraform).
%% BA supplies the doormat role ARN consumed by the region DNS node;
%% HCP supplies PrivateLink access consumed by cluster-utils (see the edge list further down).
subgraph account["cloud-infra-account-terraform"]
BA["bootstrap-account\nIAM doormat role\n(write to run.hashicorp.services)"]
HCP["hcp-account\nPrivateLink principal allow\n+ Route53 ARC control panels"]
end
%% Per-region workspace: owns the public zone and its NS delegation.
%% Its public_domain output is consumed by the primary Vault public A record (VP_DNS_PUB).
subgraph region["cloud-infra-region-terraform (per region)"]
DNS["dns.tf\npublic zone:\npurpose-env.airport.run.hashicorp.services\n+ NS delegation → bootstrap account"]
end
%% Network workspace: owns the private hosted zone for the regional domain.
%% The regional_domain_dns_zone_id output feeds both the primary and secondary private Vault A records.
subgraph network["cloud-infra-network-terraform"]
VPC["vpc workspace\nprivate hosted zone:\nregion.aws.hashicorp.cloud\n→ output: regional_domain_dns_zone_id"]
end
%% Per-stack workspaces. The cluster-init Lambda execution steps (boot) are drawn
%% inside this subgraph as well, after the Terraform workspaces.
subgraph stack["cloud-infra-stack-terraform (per stack)"]
%% cluster-utils deploys the cluster-init Lambda that both Vault workspaces invoke.
CU["cluster-utils\nDeploys cluster-init Lambda\ninto hashistack_services subnets\nenv: BOOTSTRAP_VAULT_ADDR"]
%% Consul workspace: Route 53 outbound resolver endpoints, forwarding rules for
%% .consul names, and the transit NLB on port 8600.
%% The transit endpoint and the peer-region rule exist only when multi_region=true.
subgraph consul_ws["consul workspace"]
CONSUL_SG["Security group\nallow egress :53 + :8600"]
CONSUL_EP_LOCAL["R53 outbound endpoint\nhashistack-r53-resolver\n(hashistack_services subnets)"]
CONSUL_EP_TRANSIT["R53 outbound endpoint\ntransit-r53-resolver\n(transit subnets)\nmulti_region=true only"]
CONSUL_RULE_LOCAL["Resolver rules\n'consul' + 'region.consul'\n→ transit NLB IPs :8600"]
CONSUL_RULE_REMOTE["Resolver rule\n'peer-region.consul'\n→ cidrhost(peer.cidr,2)\nmulti_region=true only"]
CONSUL_NLB["Transit NLB\nport 8600 TCP_UDP\n→ Consul server nodes"]
end
%% Primary-region Vault workspace: cluster, private and public A records, and the
%% Lambda invocation that carries no performance_replica block.
subgraph vault_ws_primary["vault workspace (PRIMARY region)"]
VP_CLUSTER["App Vault EC2 cluster\nmodule.app_vault_cluster"]
VP_DNS_PRIV["R53 A record (private)\nvault.region.aws.hashicorp.cloud\n→ private ingress ALB"]
VP_DNS_PUB["R53 A record (public)\nvault.purpose-env.airport.run...\n→ private ingress ALB"]
VP_LAMBDA["aws_lambda_invocation\ninitialize_app_vault\nvault_data: { address, replication_enabled }\nNO performance_replica (is_primary)"]
end
%% Primary-region vault-config workspace: the replication policy plus IAM auth role,
%% and the Nomad token backend, which addresses Nomad through a .consul name.
subgraph vc_primary["vault-config workspace (PRIMARY region)"]
VC_POLICY["vault-replication module\npolicy: sys/replication/performance/primary/*\n+ IAM auth role: cluster-init-replication"]
VC_NOMAD["module.nomad-token-backend\nnomad_address: nomad.service.consul:4646 ⚠️"]
end
%% Secondary-region Vault workspace: the Lambda invocation carries a performance_replica
%% block whose primary_address is the primary region's .consul name on port 8200.
subgraph vault_ws_secondary["vault workspace (SECONDARY region)"]
VS_CLUSTER["App Vault EC2 cluster\nmodule.app_vault_cluster"]
VS_DNS_PRIV["R53 A record (private)\nvault.peer-region.aws.hashicorp.cloud\n→ private ingress ALB"]
VS_LAMBDA["aws_lambda_invocation\ninitialize_app_vault\nvault_data: {\n performance_replica: {\n primary_address:\n active.vault.service.primary-region.consul:8200 ⚠️\n auth.aws_role: cluster-init-replication\n }\n}\nlifecycle_scope=CRUD, no ignore_changes"]
end
%% Steps performed inside the cluster-init Lambda.
%% BOOT1..BOOT9 are chained by the secondary-flow edges; BOOT_P is labelled as the primary path.
%% The two .consul lookups are BOOT6 (primary API, :8200) and BOOT_P (cluster address, :8201).
subgraph boot["cluster-init Lambda execution (cloud-infra-cluster-init/vault/init.go)"]
BOOT1["Lambda invoked by Terraform\naws_lambda_invocation (lifecycle_scope=CRUD)"]
BOOT2["Connects to new Vault cluster\nat request.Address (regional domain)"]
BOOT3["vault/init.go: Init()\nChecks health → initializes → waits for unseal\nwaits for Raft cluster healthy"]
BOOT4["isPerformanceReplica() == true\n→ enableSecondaryReplication()"]
BOOT5["generate-public-key on LOCAL secondary"]
BOOT6["fetchPrimaryReplicationToken()\nclient.NewVaultClient(PrimaryAddress)\n⚠️ DNS: active.vault.service.primary-region.consul:8200\nVaultLogin (aws, cluster-init-replication role)\nWrites sys/replication/performance/primary/secondary-token"]
BOOT7["initSecondary()\nWrites sys/replication/performance/secondary/enable\nwith token"]
BOOT8["restartVault()\nSSM RunShellScript → systemctl restart vault\non all cluster-id tagged instances"]
BOOT9["Re-authenticates post-restart\nretries with backoff"]
BOOT_P["enablePrimaryReplication() [PRIMARY path]\nWrites sys/replication/performance/primary/enable\nprimary_cluster_addr:\n⚠️ active.vault.service.region.consul:8201\n(port 8201 = Vault cluster port, NOT behind ALB)"]
end
end
%% Edge conventions used below: solid arrows are provisioning / ordering dependencies
%% between workspaces and steps; dotted arrows mark DNS names that must resolve
%% through the Consul resolver rules.

%% Infrastructure dependencies: outputs of the account, region and network workspaces
%% consumed by the stack workspaces.
BA -->|"doormat role ARN\n→ var.bootstrap_dns_role"| DNS
DNS -->|"output: public_domain\n(zone id+name)"| VP_DNS_PUB
VPC -->|"output: regional_domain_dns_zone_id"| VP_DNS_PRIV
VPC -->|"output: regional_domain_dns_zone_id"| VS_DNS_PRIV
HCP -->|"PrivateLink access\nto bootstrap Vault"| CU
%% Consul resolver chain: security group -> outbound endpoints -> resolver rules.
%% The local rules also depend on the transit NLB they forward to.
CONSUL_SG --> CONSUL_EP_LOCAL
CONSUL_SG --> CONSUL_EP_TRANSIT
CONSUL_EP_LOCAL --> CONSUL_RULE_LOCAL
CONSUL_EP_TRANSIT --> CONSUL_RULE_REMOTE
CONSUL_NLB --> CONSUL_RULE_LOCAL
%% Primary vault setup: the Lambda must be deployed and the cluster created before the invocation.
CU -->|"Lambda deployed before\nvault invokes it"| VP_LAMBDA
VP_CLUSTER -->|"depends_on"| VP_LAMBDA
VP_DNS_PRIV --> VP_DNS_PUB
%% vault-config on the primary is a prerequisite for replication: it runs after primary init.
VP_LAMBDA -->|"primary initialized\nbefore config runs"| VC_POLICY
%% Secondary vault setup: same Lambda, plus the replication role that must already exist on the primary.
CU -->|"same Lambda\nused by secondary"| VS_LAMBDA
VS_CLUSTER -->|"depends_on"| VS_LAMBDA
VC_POLICY -->|"cluster-init-replication role\nmust exist on primary\nbefore secondary activates"| VS_LAMBDA
%% DNS dependencies for replication: the cross-region rule serves the :8200 lookup in BOOT6,
%% the local rule serves the :8201 cluster address written by BOOT_P.
CONSUL_RULE_REMOTE -.->|"resolves\nactive.vault.service.primary-region.consul:8200\ninside Lambda at invocation time"| BOOT6
CONSUL_RULE_LOCAL -.->|"resolves\nactive.vault.service.region.consul:8201\nbaked into primary Vault state"| BOOT_P
%% Lambda execution flow (secondary): a linear chain BOOT1 through BOOT9.
VS_LAMBDA -->|"Terraform invokes Lambda"| BOOT1
BOOT1 --> BOOT2
BOOT2 --> BOOT3
BOOT3 --> BOOT4
BOOT4 --> BOOT5
BOOT5 --> BOOT6
BOOT6 --> BOOT7
BOOT7 --> BOOT8
BOOT8 --> BOOT9
%% Primary path: the primary invocation goes straight to BOOT_P in this diagram.
VP_LAMBDA -->|"Terraform invokes Lambda\n(no performance_replica)"| BOOT_P
%% Nomad consul dependency: vault-config needs nomad.service.consul to resolve when it is applied.
CONSUL_RULE_LOCAL -.->|"resolves nomad.service.consul\nat vault-config apply time"| VC_NOMAD
%% Styling. Three classes: warning (orange), infra (dark), dns (green).
classDef warning fill:#ff9900,color:#000
classDef infra fill:#232f3e,color:#fff
classDef dns fill:#1a6b3c,color:#fff
%% warning is applied to the two configuration nodes that carry a .consul address.
%% NOTE(review): BOOT6 and BOOT_P also carry a warning marker in their labels but are
%% not in this class; confirm that is intentional.
class VS_LAMBDA,VC_NOMAD warning
%% infra marks the account-level and network-level source nodes.
class BA,HCP,VPC infra
%% dns marks hosted-zone, A-record and resolver-rule nodes.
class DNS,VP_DNS_PRIV,VP_DNS_PUB,VS_DNS_PRIV,CONSUL_RULE_LOCAL,CONSUL_RULE_REMOTE dns
| Phase | What happens |
|---|---|
| 1 | account-terraform — IAM doormat role exists in bootstrap account |
| 2 | region-terraform — public zone created + NS delegated |
| 3 | network-terraform (vpc) — private hosted zone created, zone ID output |
| 4 | cluster-utils — cluster-init Lambda deployed into VPC |
| 5 | consul — R53 resolver endpoints + rules up; .consul DNS now works in VPC |
| 6 | vault (primary) — cluster stood up, DNS records created, Lambda invoked (no replica config), primary initialized via Ansible |
| 7 | vault-config (primary) — cluster-init-replication policy + IAM auth role written to primary Vault |
| 8 | vault (secondary) — cluster stood up, Lambda invoked with performance_replica.primary_address = `active.vault.service.<primary-region>.consul:8200` — requires phase 5 (cross-region resolver) and phase 7 (replication role on primary) |
| 9 | cluster-init Lambda (secondary path) — vault/init.go: enableSecondaryReplication(): generates public key → connects to primary via .consul:8200 → fetches secondary token → enables secondary → SSM-restarts all vault nodes → re-authenticates |
| 10 | Ongoing replication stream — primary Vault uses `active.vault.service.<region>.consul:8201` (baked in at primary enable time) for node-to-node replication traffic; requires local R53 resolver permanently |
The Lambda is a Go binary. The replication handoff happens entirely inside vault/init.go:
```
Init()
└─ isPerformanceReplica()? (true if PerformanceReplica struct is non-empty)
   ├─ YES → enableSecondaryReplication()
   │    1. POST sys/replication/performance/secondary/generate-public-key (local)
   │    2. fetchPrimaryReplicationToken()
   │       └─ client.NewVaultClient(PrimaryAddress)
   │          ⚠️ DNS: active.vault.service.<primary-region>.consul:8200
   │       └─ VaultLogin (aws auth, cluster-init-replication role)
   │       └─ POST sys/replication/performance/primary/secondary-token (PRIMARY)
   │    3. POST sys/replication/performance/secondary/enable (local)
   │    4. restartVault() via SSM RunShellScript on all cluster-id instances
   │    5. Re-authenticate post-restart (retry with backoff)
   └─ NO → enablePrimaryReplication()
        └─ POST sys/replication/performance/primary/enable
           primary_cluster_addr:
           ⚠️ active.vault.service.<region>.consul:8201
```
| Location | DNS name | Port | Resolver needed | Replaceable? |
|---|---|---|---|---|
| cluster-init/vault/init.go: fetchPrimaryReplicationToken | `active.vault.service.<primary-region>.consul` | 8200 (API) | Cross-region R53 resolver chain: remote rule → TGW → peer VPC +2 → peer Consul NLB :8600 | ✅ Yes — `vault.<region>.aws.hashicorp.cloud` resolves via private hosted zone; ALB handles active-node routing |
| cluster-init/vault/init.go: enablePrimaryReplication | `active.vault.service.<region>.consul` | 8201 (cluster) | Local R53 resolver: local rule → transit NLB :8600 | ❌ Not directly — port 8201 bypasses the ALB; would need a dedicated NLB for :8201 (see "Hard/blocked") |
| vault-config/terraform/locals.tf → module.nomad-token-backend | `nomad.service.consul` | 4646 | Local R53 resolver | ✅ Yes — swap for `nomad.<region>.aws.hashicorp.cloud` if that A record exists |
Easy wins (resolver endpoints not required):
- `active.vault.service.<region>.consul:8200` → `vault.<region>.aws.hashicorp.cloud:8200`
  Change in `_config/stack/component_vault.tf:21`. ALB already routes to active node.
- `nomad.service.consul:4646` → `nomad.<region>.aws.hashicorp.cloud:4646`
  Change in `vault-config/terraform/locals.tf:8`.

Both would resolve via the private hosted zone using aws_route53_zone_association — free, no ENIs.
Hard/blocked:
`active.vault.service.<region>.consul:8201` (primary cluster addr) — this is baked into
Vault's internal replication state and used for the ongoing replication stream between
primary and secondary nodes. Port 8201 bypasses the ALB entirely. Replacing this would
require a dedicated NLB for Vault cluster traffic on :8201, or accepting that the R53
resolver is permanently load-bearing for replication stream maintenance.
Net conclusion: The R53 resolver endpoints cannot be fully eliminated today, but the
cross-region resolver endpoint (transit-r53-resolver, the expensive one) could be
removed if the :8200 lookup is swapped. The local resolver endpoint
(hashistack-r53-resolver) remains required for the :8201 cluster addr and for
runtime Consul service discovery by Nomad jobs.