Last active
May 26, 2025 03:40
-
-
Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.
Kernel tuning for dedicated linux server. /etc/sysctl.d/60-sysctl.conf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/60-sysctl.conf
# Generic Web + DB Server Tuning
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl --system
# NOTE: sysctl.conf has no trailing-comment syntax; text after a value
# becomes part of the value and makes sysctl fail to apply the key.
# All comments in this file are therefore on their own lines.

########################
# Kernel & Memory
########################

# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3

# How aggressively the kernel will swap memory pages.
# Lower values tell the kernel to prefer dropping caches over swapping.
# 10 is a good starting point for DBs and general servers.
vm.swappiness = 10

# Maximum percentage of total system memory that can hold dirty pages
# before processes are forced to write data synchronously.
# 15% is reasonable with moderately fast storage. Consider lower (e.g., 10%) for slower disks.
vm.dirty_ratio = 15

# Percentage of total system memory where background kernel flusher threads
# will start writing dirty data to disk.
vm.dirty_background_ratio = 5

# '1' means the kernel will always approve memory requests, potentially overcommitting.
# Useful for applications like databases that might allocate large buffers upfront.
# Monitor for OOM killer activity if memory is tight.
vm.overcommit_memory = 1

# System-wide limit for open file handles.
# 4,194,304 is very generous. 2,097,152 is also often sufficient.
# Good for web servers (many sockets) and DBs (many data files, connections).
fs.file-max = 4194304

# Maximum number of processes/threads. Useful for web servers with many workers.
# Default is often 32768.
kernel.pid_max = 65536

# Minimum amount of free RAM (in KB) the system should maintain.
# Prevents system from running completely out of memory for critical operations.
# 128MB is a common value. Increase for systems with very large RAM (e.g. 256MB for 128GB+ RAM)
vm.min_free_kbytes = 131072

########################
# Network Core, Buffers & Qdisc
########################

# Default queuing discipline. fq_codel is excellent for reducing bufferbloat and ensuring fairness.
# If using BBR for TCP congestion control, 'fq' is its canonical partner and is
# also a fine choice here.
net.core.default_qdisc = fq_codel

# Maximum number of packets queued on the input side of a network interface
# when the interface receives packets faster than the kernel can process them.
# Good for 1GbE+ NICs under high load.
net.core.netdev_max_backlog = 5000

# Maximum number of connection requests queued for a listening socket (listen() backlog).
# Crucial for web servers handling many incoming connections.
# 1024 is okay, but higher values like 4096 or 8192 are common for busy web servers.
net.core.somaxconn = 4096

# Default and maximum socket receive/send buffer sizes (bytes).
# Defaults: 256KB. Maximums: 16MB.
# Very large defaults (e.g. 30MB/64MB) can consume significant memory if many
# sockets are open; Linux auto-tuning is generally good, so moderated values
# are better for general servers.
net.core.rmem_default = 262144
net.core.wmem_default = 262144
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
# Note: net.ipv4.tcp_rmem/wmem will override these for TCP sockets.

########################
# TCP Tuning
########################

# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1

# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state).
# Increased from 2048 for busy web servers.
net.ipv4.tcp_max_syn_backlog = 4096

# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy web servers.
# 20 is slightly more aggressive than 30.
net.ipv4.tcp_fin_timeout = 20

# Allow reusing sockets in TIME-WAIT state for new outgoing connections.
# Requires tcp_timestamps=1. Useful if the server makes many outgoing connections
# (e.g., web server to backend DB, or DB to other services).
net.ipv4.tcp_tw_reuse = 1

# Enable TCP Timestamps. Required for tcp_tw_reuse and helps with RTT estimation.
# Usually enabled by default, but good to be explicit.
net.ipv4.tcp_timestamps = 1

# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT and the sysctl was
# removed in kernels >= 4.12, where even writing 0 makes sysctl report an
# unknown-key error. Left commented out; uncomment only on kernels < 4.12.
# net.ipv4.tcp_tw_recycle = 0

# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 mins of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5

# Range of ephemeral ports for outgoing connections. Expands the available pool.
# Max is 65535.
net.ipv4.ip_local_port_range = 1024 65535

# TCP receive and send buffer sizes (min, default, max in bytes).
# These override net.core.rmem/wmem_default/max for TCP.
# A max of 16MB is usually sufficient for a generic server; the defaults
# (87KB read, 64KB write) are reasonable starting points.
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216

# TCP congestion control algorithm. BBR often improves throughput and latency,
# especially over connections with some packet loss or varying RTTs.
# Ensure the 'tcp_bbr' module is available and loaded.
net.ipv4.tcp_congestion_control = bbr

# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data (common for web).
net.ipv4.tcp_slow_start_after_idle = 0

# Enable Path MTU discovery probing. '1' enables after ICMP black hole detection.
net.ipv4.tcp_mtu_probing = 1

# Do not save TCP metrics from closed connections. Can be useful if routes/paths change.
net.ipv4.tcp_no_metrics_save = 1

########################
# Conntrack (Netfilter Connection Tracking)
# Important if the server is behind a firewall, acts as a firewall/NAT,
# or uses local firewall rules extensively.
# Adjust based on actual usage (cat /proc/sys/net/netfilter/nf_conntrack_count)
# Each entry uses ~300 bytes of non-swappable kernel memory.
########################

# Max entries (524288 * ~300 bytes = ~150MB RAM).
net.netfilter.nf_conntrack_max = 524288
# 2 hours for established TCP.
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
# Should be >= net.ipv4.tcp_fin_timeout.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 120

########################
# Security & Misc
########################

# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1

# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0

# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0

# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1

# For IPv6, if actively used:
# net.ipv6.conf.all.accept_redirects = 0
# net.ipv6.conf.default.accept_redirects = 0
# net.ipv6.conf.all.accept_source_route = 0
# net.ipv6.conf.default.accept_source_route = 0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/60-sysctl.conf
# Optimized Generic Web + DB Server Tuning for CentOS 6 (kernel 2.6.32)
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl -p /etc/sysctl.d/60-sysctl.conf
# or reboot. To apply to current system without reboot: sysctl --system
# NOTE: sysctl.conf has no trailing-comment syntax; text after a value
# becomes part of the value and makes sysctl fail to apply the key.
# All comments in this file are therefore on their own lines.

########################
# Kernel & Memory / I/O Writeback
########################

# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3

# How aggressively the kernel will swap memory pages.
# Lower values tell the kernel to prefer dropping caches over swapping.
# 10 is a good starting point for DBs and general servers.
vm.swappiness = 10

# Tend to keep dentry/inode caches longer, good for frequent file access (DB data files, web static content).
vm.vfs_cache_pressure = 50

# Maximum percentage of total system memory that can hold dirty pages
# before processes are forced to write data synchronously.
# 15% is reasonable with moderately fast storage. For very slow disks, consider 10%.
vm.dirty_ratio = 15

# Percentage of total system memory where background kernel flusher threads
# will start writing dirty data to disk.
vm.dirty_background_ratio = 5

# Minimum amount of free RAM (in KB) the system should maintain.
# 64MB is a reasonable minimum for systems of this era.
vm.min_free_kbytes = 65536

# '1' means the kernel will always approve memory requests, potentially overcommitting.
# Useful for applications like databases that might allocate large buffers upfront.
# Monitor for OOM killer activity if memory is tight.
vm.overcommit_memory = 1

########################
# File Handles & Tasks
########################

# System-wide limit for open file handles.
# Good for web servers (many sockets) and DBs (many data files, connections).
fs.file-max = 2097152

# Maximum number of processes/threads. Useful for web servers with many workers.
# Default is often 32768.
kernel.pid_max = 65536

########################
# Network Core, Buffers & Queues
########################

# Maximum number of connection requests queued for a listening socket (listen() backlog).
# Crucial for web servers handling many incoming connections.
# Increased from original 1024.
net.core.somaxconn = 4096

# Maximum number of packets queued on the input side of a network interface
# when the interface receives packets faster than the kernel can process them.
# Good for 1GbE NICs under high load.
net.core.netdev_max_backlog = 5000

# Default socket receive buffer size (bytes).
net.core.rmem_default = 262144
# Default socket send buffer size (bytes).
net.core.wmem_default = 262144
# Maximum socket receive buffer size (16MB).
net.core.rmem_max = 16777216
# Maximum socket send buffer size (16MB).
net.core.wmem_max = 16777216
# Note: net.ipv4.tcp_rmem/wmem will override these for TCP sockets.

########################
# TCP Tuning (Kernel 2.6.32 specific considerations)
########################

# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1

# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state).
# Increase for busy web servers. Should generally be >= somaxconn.
net.ipv4.tcp_max_syn_backlog = 4096

# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy web servers.
# 25 is slightly more conservative than the 15-20 used on newer kernels.
net.ipv4.tcp_fin_timeout = 25

# Allow reusing sockets in TIME-WAIT state for new outgoing connections.
# Requires tcp_timestamps=1. Useful if the server makes many outgoing connections.
net.ipv4.tcp_tw_reuse = 1

# Enable TCP Timestamps. Required for tcp_tw_reuse and helps with RTT estimation.
# Often on by default, but explicit is better.
net.ipv4.tcp_timestamps = 1

# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT.
# Setting to 0 ensures it's off (the sysctl still exists on kernel 2.6.32).
net.ipv4.tcp_tw_recycle = 0

# Range of ephemeral ports for outgoing connections. Expands the available pool.
# Max is 65535.
net.ipv4.ip_local_port_range = 1024 65535

# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 mins of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5

# TCP receive and send buffer sizes (min, default, max in bytes).
# These override net.core.rmem/wmem_default/max for TCP.
# Max values match net.core limits.
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216

# Enable TCP window scaling (RFC 1323). Usually on by default.
net.ipv4.tcp_window_scaling = 1

# Enable Selective Acknowledgements (SACK). Usually on by default.
net.ipv4.tcp_sack = 1

# Enable TCP auto-tuning of receive buffer. Usually on by default.
net.ipv4.tcp_moderate_rcvbuf = 1

# TCP Congestion Control: CentOS 6 (kernel 2.6.32) defaults to 'cubic'.
# 'bbr' is not available. 'cubic' is generally the best choice for this kernel.
# No need to set it if it's already the default, but you could explicitly set:
# net.ipv4.tcp_congestion_control = cubic

# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data (common for web).
net.ipv4.tcp_slow_start_after_idle = 0

########################
# Conntrack (Netfilter Connection Tracking)
# Important if the server has a local firewall (iptables) or handles NAT.
# For CentOS 6, the parameter might be ip_conntrack_max if nf_conntrack module isn't dominant.
# nf_conntrack is generally preferred if available and loaded.
# Check with: lsmod | grep -E "nf_conntrack|ip_conntrack"
# And then check existence of /proc/sys/net/netfilter/nf_conntrack_max or /proc/sys/net/ipv4/ip_conntrack_max
########################

# Assuming nf_conntrack is in use:
# Max entries (262144 * ~300 bytes = ~75MB RAM).
net.netfilter.nf_conntrack_max = 262144
# 2 hours for established TCP.
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
# Should be >= net.ipv4.tcp_fin_timeout.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 120

# If using the older ip_conntrack (less likely for later 2.6.32 but possible),
# adjust based on memory:
# net.ipv4.ip_conntrack_max = 65536
# net.ipv4.ip_conntrack_tcp_timeout_established = 7200

########################
# Security & Hardening
########################

# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1

# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0

# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0

# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1

########################
# IPv6 (Optional - Disable if not used)
# Disabling IPv6 can slightly reduce kernel overhead and attack surface if unused.
########################

# Uncomment the following lines if you are NOT using IPv6:
# net.ipv6.conf.all.disable_ipv6 = 1
# net.ipv6.conf.default.disable_ipv6 = 1
# net.ipv6.conf.lo.disable_ipv6 = 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/80-k8s-ipvs-optimized.conf
# Optimized Kubernetes Node Tuning for kube-proxy IPVS mode
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl --system
# NOTE: sysctl.conf has no trailing-comment syntax; text after a value
# becomes part of the value and makes sysctl fail to apply the key.
# All comments in this file are therefore on their own lines.

########################
# System & Memory
########################

# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3

# System-wide limit for open file handles. High value is good for many containers.
fs.file-max = 2097152

# Maximum number of processes/threads. Increased for many pods/containers.
# Consider also kernel.threads-max if pid_max is very aggressively increased.
kernel.pid_max = 65536

# Strongly prefer dropping caches over swapping out Pod/application memory.
# Kubernetes workloads should ideally have memory requests/limits set to avoid swapping.
# Some use 1 or 0 for K8s nodes if RAM is plentiful.
vm.swappiness = 10

# Allow memory overcommit. '1' means the kernel will always approve memory requests.
# Crucial for K8s to allow Pods to start, but ensure proper resource requests/limits
# to prevent actual memory exhaustion leading to OOM killer activity.
vm.overcommit_memory = 1

########################
# TCP Performance
########################

# Enable TCP window scaling, selective acknowledgements, and timestamps. Standard for modern TCP.
net.ipv4.tcp_window_scaling = 1
net.ipv4.tcp_sack = 1
# Timestamps are required for tcp_tw_reuse and also help RTT estimation.
net.ipv4.tcp_timestamps = 1

# Enable TCP Fast Open (TFO). '3' means enable for client and server.
# Can reduce latency for repeated connections if supported by client and server.
net.ipv4.tcp_fastopen = 3

# Max SYN backlog. Increased to handle bursts of new connections to services.
net.ipv4.tcp_max_syn_backlog = 8192

# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1

# Number of SYN retries before giving up. Lowering can speed up failure detection.
# Default is often 5 or 6. For internal K8s traffic, 2-3 might be acceptable.
net.ipv4.tcp_syn_retries = 3

# Number of SYN+ACK retries. Similar considerations.
net.ipv4.tcp_synack_retries = 3

# Range of ephemeral ports for outgoing connections. Expands the available pool.
net.ipv4.ip_local_port_range = 1024 65535

# TCP congestion control. BBR is good for throughput and latency.
# Ensure the 'tcp_bbr' module is available and loaded, and ideally use 'fq' qdisc.
net.ipv4.tcp_congestion_control = bbr

# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data.
net.ipv4.tcp_slow_start_after_idle = 0

# Enable Path MTU discovery probing. '1' enables after ICMP black hole detection.
net.ipv4.tcp_mtu_probing = 1

# Do not save TCP metrics from closed connections. Can be useful if routes/paths change.
net.ipv4.tcp_no_metrics_save = 1

# Kernel attempts to aggregate small writes before sending (Nagle's algorithm related).
# Default is usually 1.
net.ipv4.tcp_autocorking = 1

# Limit unsent data queued in the socket send buffer to 16KB (TCP_NOTSENT_LOWAT).
# Reduces per-socket send-buffer memory and queueing latency.
# NOTE(review): this is TCP_NOTSENT_LOWAT, not SO_SNDLOWAT as the original
# comment claimed.
net.ipv4.tcp_notsent_lowat = 16384

########################
# Network Buffers & Qdisc
########################

# Default queuing discipline. fq_codel is excellent for fairness and reducing bufferbloat.
# If using BBR, 'fq' is its canonical partner, but fq_codel is also a very strong choice.
net.core.default_qdisc = fq_codel

# Maximum number of packets queued on the input side of a network interface.
# Very high value, good for high-speed NICs (10GbE+) to prevent drops.
net.core.netdev_max_backlog = 30000

# Maximum number of connection requests queued for a listening socket (listen() backlog)
# Increased from original 1024 for better handling of connection bursts.
net.core.somaxconn = 4096

# Default and maximum socket receive/send buffer sizes.
# Defaults: 1MB. Maximums: 16MB.
# Very large fixed defaults (30MB/64MB) can consume excessive memory per socket;
# Linux auto-tuning (net.ipv4.tcp_moderate_rcvbuf=1 by default) usually works well,
# so these are generous without being extreme.
net.core.rmem_default = 1048576
net.core.wmem_default = 1048576
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216

########################
# Fragmentation & Time-Wait
########################

# Memory limits for IP fragment reassembly (bytes). Increased for robustness.
# High threshold: 4MB.
net.ipv4.ipfrag_high_thresh = 4194304
# Low threshold: 3MB.
net.ipv4.ipfrag_low_thresh = 3145728
# Timeout for reassembling fragments (seconds).
net.ipv4.ipfrag_time = 30

# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy servers.
net.ipv4.tcp_fin_timeout = 15

# tcp_tw_recycle: DO NOT ENABLE. Problematic with NAT, removed in kernels >= 4.12,
# where even writing 0 makes sysctl report an unknown-key error.
# Left commented out; uncomment only on kernels < 4.12.
# net.ipv4.tcp_tw_recycle = 0

# tcp_tw_reuse: Allow reusing sockets in TIME-WAIT state for new OUTGOING connections.
# Requires tcp_timestamps=1. Generally safe and beneficial for nodes making many
# outgoing connections (e.g., to backend Pods). Changed from original 0.
net.ipv4.tcp_tw_reuse = 1

# Maximum number of sockets in TIME-WAIT state.
# High value for K8s due to high connection churn (default is often much lower).
# Monitor actual usage.
net.ipv4.tcp_max_tw_buckets = 1440000

# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 mins of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5

########################
# Routing & Bridge/NF (Essential for K8s)
########################

# Enable IP forwarding. Critical for K8s nodes.
net.ipv4.ip_forward = 1

# Pass bridged IPv4/IPv6 traffic through netfilter (iptables/nftables) hooks.
# CRITICAL for K8s CNI plugins and network policies to function correctly,
# even when IPVS is used for service load balancing.
# NOTE: these keys exist only after the br_netfilter module is loaded
# (e.g. via /etc/modules-load.d/); otherwise sysctl reports them as unknown.
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1

# For IPv6 if used in the cluster:
# net.ipv6.conf.all.forwarding = 1
# net.ipv6.conf.default.forwarding = 1

########################
# IPVS (kube-proxy specific settings)
# NOTE: net.ipv4.vs.* keys exist only after the ip_vs module is loaded.
########################

# Enable IPVS connection tracking integration with netfilter conntrack.
# This is often required for network policies (Calico, Cilium, etc.) to correctly
# see and filter IPVS-handled traffic.
net.ipv4.vs.conntrack = 1

# Expire connections to a destination server when it's removed from the service.
# Helps in faster convergence when backend Pods are deleted.
net.ipv4.vs.expire_nodest_conn = 1

# Expire persistent templates when a destination server is quiesced (weight 0).
# Useful for graceful backend removal.
net.ipv4.vs.expire_quiescent_template = 1

# Be more lenient with TCP state transitions for IPVS. Can help with some clients/NAT.
net.ipv4.vs.sloppy_tcp = 1

# Be more lenient with UDP "connections" for IPVS.
net.ipv4.vs.sloppy_udp = 1

########################
# Conntrack (Netfilter Connection Tracking)
# Values depend heavily on cluster size, traffic patterns, and available RAM.
# Each conntrack entry uses ~300 bytes of non-swappable kernel memory.
########################

# Maximum number of connection tracking entries.
# 1,048,576 * ~300 bytes = ~300MB RAM. Adjust based on monitoring.
# Monitor with: cat /proc/sys/net/netfilter/nf_conntrack_count
net.netfilter.nf_conntrack_max = 1048576

# Timeout for established TCP connections (seconds). Default is 5 days (432000).
# 12 hours (43200) can help recycle entries faster but might drop long-lived idle connections.
net.netfilter.nf_conntrack_tcp_timeout_established = 43200

# Shorter timeouts for TCP connections in closing states.
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60

# TIME_WAIT timeout in conntrack should generally align with net.ipv4.tcp_fin_timeout
# or be slightly longer. Default is 120s.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120

# Timeout for generic (non-TCP/UDP/ICMP) protocol entries.
net.netfilter.nf_conntrack_generic_timeout = 60

########################
# Security Hardening
########################

# Enable strict reverse path filtering to prevent IP spoofing.
net.ipv4.conf.all.rp_filter = 1
# Set to 2 (loose) if asymmetric routing is used on the node itself.
net.ipv4.conf.default.rp_filter = 1

# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0

# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0

# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1

# Ignore ICMP echo requests to broadcast/multicast addresses
net.ipv4.icmp_echo_ignore_broadcasts = 1

# NOTE(review): there is no "net.ipv4.icmp_timestamp_ignore_all" sysctl in the
# Linux kernel; the original line would make sysctl report an unknown-key error.
# To drop ICMP timestamp requests, use a firewall rule instead, e.g.:
#   iptables -A INPUT -p icmp --icmp-type timestamp-request -j DROP
# net.ipv4.icmp_timestamp_ignore_all = 1

########################
# ARP Cache (Potentially useful for K8s with many pods/services on same L2)
########################

# Adjust ARP cache garbage collection thresholds if you have a very large number of
# active IP addresses on the same L2 network as the node.
# Threshold1: Soft limit, GC starts here.
# Threshold2: Hard limit, GC becomes more aggressive.
# Threshold3: Absolute max, entries might be dropped.
# Defaults are often 128, 512, 1024. Increase if ARP cache overflows are suspected.
# net.ipv4.neigh.default.gc_thresh1 = 1024
# net.ipv4.neigh.default.gc_thresh2 = 2048
# net.ipv4.neigh.default.gc_thresh3 = 4096
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/80-k8s.conf | |
# Optimized Kubernetes Node Network Tuning for kube-proxy native nftables mode | |
# Based on original by Kawin Viriyaprasopsook <[email protected]> | |
# Apply with: sysctl --system | |
######################## | |
# Kernel & Memory | |
######################## | |
# Reduce console noise from kernel messages | |
kernel.printk = 3 4 1 3 | |
# Strongly prefer dropping caches over swapping out Pod/application memory. | |
# Kubernetes workloads should ideally have memory requests/limits set to avoid swapping. | |
vm.swappiness = 10 # Some even use 1 or 0 for K8s nodes if RAM is plentiful. | |
# Allow memory overcommit. '1' means the kernel will always approve memory requests. | |
# This can be useful for applications that request more memory than they immediately use. | |
# However, ensure your K8s resource requests/limits are well-defined to prevent | |
# actual memory exhaustion leading to OOM killer activity. | |
vm.overcommit_memory = 1 | |
# System-wide limit for open file handles. High value is good for many containers. | |
fs.file-max = 2097152 | |
# Maximum number of processes/threads. Increased for many pods/containers. | |
kernel.pid_max = 65536 | |
# Consider also kernel.threads-max if pid_max is very aggressively increased, | |
# though pid_max usually governs overall process/thread count. | |
######################## | |
# TCP Performance | |
######################## | |
# Enable TCP window scaling, selective acknowledgements, and timestamps. | |
# These are standard for modern TCP and generally beneficial. | |
net.ipv4.tcp_window_scaling = 1 | |
net.ipv4.tcp_sack = 1 | |
net.ipv4.tcp_timestamps = 1 # Needed for tcp_tw_reuse, also helps RTT estimation. | |
# Enable TCP Fast Open (TFO). '3' means enable for client and server. | |
# Can reduce latency for repeated connections if supported by client and server. | |
net.ipv4.tcp_fastopen = 3 | |
# Max SYN backlog. Increased to handle bursts of new connections to services. | |
net.ipv4.tcp_max_syn_backlog= 8192 | |
# Enable TCP SYN cookies to help protect against SYN flood attacks. | |
net.ipv4.tcp_syncookies = 1 | |
# Number of SYN retries before giving up. Lowering can speed up failure detection | |
# for unresponsive peers but might be too aggressive for lossy external networks. | |
# Default is often 5 or 6. For internal K8s traffic, 2 might be acceptable. | |
net.ipv4.tcp_syn_retries = 2 | |
# Number of SYN+ACK retries. Similar considerations as tcp_syn_retries. | |
net.ipv4.tcp_synack_retries = 2 | |
# Range of ephemeral ports for outgoing connections. Expands the available pool. | |
net.ipv4.ip_local_port_range= 1024 65535 | |
# TCP congestion control. BBR is good for throughput and latency. | |
# Ensure 'bbr' module is available and loaded, and ideally use 'fq' qdisc. | |
net.ipv4.tcp_congestion_control = bbr | |
# Disable restarting TCP slow start after an idle period. | |
# Can improve performance for connections that are idle then burst data. | |
net.ipv4.tcp_slow_start_after_idle = 0 | |
# Enable Path MTU discovery probing. | |
net.ipv4.tcp_mtu_probing = 1 # '0' (disabled), '1' (enabled only after ICMP black hole), '2' (always enabled) | |
# Do not save TCP metrics from closed connections. Can be useful if routes/paths change. | |
net.ipv4.tcp_no_metrics_save= 1 | |
# Kernel attempts to aggregate small writes before sending (Nagle's algorithm related). | |
# Default is usually 1. | |
net.ipv4.tcp_autocorking = 1 | |
# Socket send buffer low watermark (SO_SNDLOWAT). 16KB. | |
# This tells the kernel to try to ensure there's at least this much unsent data | |
# available for the NIC to DMA, potentially improving throughput for some drivers/NICs. | |
net.ipv4.tcp_notsent_lowat = 16384 | |
######################## | |
# Network Buffers & Qdisc | |
######################## | |
# Maximum number of packets queued on the input side of a network interface. | |
# Very high value, good for high-speed NICs (10GbE+) to prevent drops. | |
net.core.netdev_max_backlog= 30000 | |
# Default and maximum TCP receive/send buffer sizes. | |
# The original values (30MB/64MB) are very large and might consume excessive memory | |
# per socket, especially if many sockets don't need such large buffers. | |
# Moderating these while still being generous is often better. | |
# Linux auto-tuning (net.ipv4.tcp_moderate_rcvbuf=1 by default) usually works well. | |
# Setting very large fixed defaults can sometimes be counterproductive unless you have | |
# specific high BDP (Bandwidth-Delay Product) paths that require them. | |
# These are still generous values: | |
# Defaults of 1MB per socket, hard caps of 16MB: | |
net.core.rmem_default = 1048576 | |
net.core.wmem_default = 1048576 | |
net.core.rmem_max = 16777216 | |
net.core.wmem_max = 16777216 | |
# Default queuing discipline. fq_codel is excellent for fairness and reducing bufferbloat. | |
# If using BBR, 'fq' is its canonical partner, but fq_codel is also a very strong choice. | |
net.core.default_qdisc = fq_codel | |
######################## | |
# IP Fragmentation | |
######################## | |
# Memory limits for IP fragment reassembly. | |
# These allow buffering more fragments if heavy fragmentation occurs. | |
# K8s networking (CNIs) generally tries to avoid fragmentation. | |
# High threshold 4MB, low threshold 3MB (raised from older 256KB/192KB defaults). | |
net.ipv4.ipfrag_high_thresh = 4194304 | |
net.ipv4.ipfrag_low_thresh = 3145728 | |
# Timeout for reassembling fragments (seconds). | |
net.ipv4.ipfrag_time = 30 | |
######################## | |
# TIME-WAIT & Keepalive | |
######################## | |
# tcp_tw_recycle: DO NOT ENABLE. Problematic with NAT, and the sysctl was removed | |
# in kernels >= 4.12, where even setting it to 0 makes 'sysctl --system' log a | |
# "cannot stat" error. Left commented out; uncomment only on pre-4.12 kernels. | |
# net.ipv4.tcp_tw_recycle = 0 | |
# tcp_tw_reuse: Allow reusing sockets in TIME-WAIT state for new OUTGOING connections. | |
# Requires tcp_timestamps=1. Generally safe and beneficial for nodes making many | |
# outgoing connections (e.g., to other services, external APIs). | |
# Original had 0, changing to 1 is generally recommended for busy servers. | |
net.ipv4.tcp_tw_reuse = 1 | |
# Maximum number of sockets in TIME-WAIT state. | |
# High value for K8s due to high connection churn; the kernel default is often | |
# much lower. Monitor actual usage. | |
net.ipv4.tcp_max_tw_buckets = 1440000 | |
# TCP Keepalive settings: Detect dead connections. | |
# First probe after 10 minutes of idleness, subsequent probes every 60 seconds, | |
# connection declared dead after 5 failed probes. | |
net.ipv4.tcp_keepalive_time = 600 | |
net.ipv4.tcp_keepalive_intvl = 60 | |
net.ipv4.tcp_keepalive_probes = 5 | |
######################## | |
# Security & ICMP | |
######################## | |
# Disable acceptance of ICMP redirect messages (potential MITM vector) | |
net.ipv4.conf.all.accept_redirects = 0 | |
net.ipv4.conf.default.accept_redirects = 0 | |
# Disable acceptance of source routed packets (security risk) | |
net.ipv4.conf.all.accept_source_route = 0 | |
net.ipv4.conf.default.accept_source_route= 0 | |
# Enable strict reverse path filtering to prevent IP spoofing. | |
# Use 2 (loose) instead if asymmetric routing is used on the node itself. | |
net.ipv4.conf.all.rp_filter = 1 | |
net.ipv4.conf.default.rp_filter = 1 | |
# Log packets with impossible source addresses (martians) | |
net.ipv4.conf.all.log_martians = 1 | |
# Ignore ICMP echo requests to broadcast/multicast addresses | |
net.ipv4.icmp_echo_ignore_broadcasts = 1 | |
# There is no 'icmp_timestamp_ignore_all' sysctl in Linux; applying it only makes | |
# 'sysctl --system' log an error. To ignore ICMP timestamp requests, drop ICMP | |
# type 13 with the firewall (iptables/nftables) instead. | |
# net.ipv4.icmp_timestamp_ignore_all = 1 | |
######################## | |
# Routing & Bridge/nft (Essential for K8s) | |
######################## | |
# Enable IP forwarding. Critical for K8s nodes. | |
net.ipv4.ip_forward = 1 | |
# Pass bridged IPv4/IPv6 traffic through netfilter (iptables/nftables) hooks. | |
# CRITICAL for K8s CNI plugins and kube-proxy (even in nftables mode for its own rules) | |
# to apply network policies, NAT, service routing, etc. | |
net.bridge.bridge-nf-call-iptables = 1 | |
net.bridge.bridge-nf-call-ip6tables = 1 | |
# nftables uses the same L2 hooks as iptables for bridged traffic. | |
# For IPv6 if used in the cluster: | |
# net.ipv6.conf.all.forwarding = 1 | |
# net.ipv6.conf.default.forwarding = 1 | |
######################## | |
# Conntrack (nftables mode relies on nf_conntrack) | |
# Values depend heavily on cluster size, traffic patterns, and available RAM. | |
# Each conntrack entry uses ~300 bytes of non-swappable kernel memory. | |
######################## | |
# Maximum number of connection tracking entries. | |
# 1,048,576 * ~300 bytes = ~300MB RAM. Adjust based on monitoring. | |
net.netfilter.nf_conntrack_max = 1048576 | |
# Monitor with: cat /proc/sys/net/netfilter/nf_conntrack_count | |
# Timeout for established TCP connections (seconds). Default is 5 days (432000). | |
# 12 hours (43200) can help recycle entries faster but might drop long-lived idle connections. | |
net.netfilter.nf_conntrack_tcp_timeout_established = 43200 | |
# Shorter timeouts for TCP connections in closing states. | |
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60 | |
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60 | |
# Keep roughly in line with the TCP TIME-WAIT interval. | |
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120 | |
# Timeout for generic (non-TCP/UDP/ICMP) protocol entries. | |
net.netfilter.nf_conntrack_generic_timeout = 60 | |
######################## | |
# ARP Cache (Potentially useful for K8s with many pods/services) | |
######################## | |
# Adjust ARP cache garbage collection thresholds if you have a very large number of | |
# active IP addresses on the same L2 network as the node. | |
# Threshold1: Soft limit, GC starts here. | |
# Threshold2: Hard limit, GC becomes more aggressive. | |
# Threshold3: Absolute max, entries might be dropped. | |
# Defaults are often 128, 512, 1024. Increase if ARP cache overflows are suspected. | |
# net.ipv4.neigh.default.gc_thresh1 = 1024 | |
# net.ipv4.neigh.default.gc_thresh2 = 2048 | |
# net.ipv4.neigh.default.gc_thresh3 = 4096 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/80-pve.conf | |
# Optimized Proxmox VE Host Tuning | |
# Based on original by Kawin Viriyaprasopsook <[email protected]> | |
# Apply with: sysctl --system | |
######################## | |
# Memory & VM Caching | |
######################## | |
# Reduce console noise from kernel messages | |
kernel.printk = 3 4 1 3 | |
# Strongly prefer dropping caches over swapping out application (VM) memory | |
vm.swappiness = 10 | |
# Tend to keep dentry/inode caches longer, good for frequent file access (VM disks) | |
vm.vfs_cache_pressure = 50 | |
# Max % of total memory for dirty pages before forcing synchronous writes | |
vm.dirty_ratio = 10 | |
# % of total memory for dirty pages before background kernel flusher threads start writing | |
vm.dirty_background_ratio = 5 | |
# For systems with very fast storage and lots of RAM, consider using | |
# vm.dirty_bytes and vm.dirty_background_bytes for more absolute control. | |
# Minimum amount of free RAM (in KB) the system should maintain (128MB) | |
# Increase if you have very large amounts of RAM (e.g., 256GB+) | |
vm.min_free_kbytes = 131072 | |
# Maximum number of memory map areas a process can have | |
# Increased for some containerized workloads (e.g., Elasticsearch) or complex applications | |
vm.max_map_count = 262144 | |
######################## | |
# File Handles & PIDs | |
######################## | |
# System-wide limit for open file handles | |
fs.file-max = 2097152 | |
# Maximum number of processes/threads the system can have | |
# Increased from default (often 32768) for busy virtualization hosts | |
kernel.pid_max = 65536 | |
# Consider kernel.threads-max as well if pid_max is significantly increased, | |
# though pid_max often covers thread limits too. | |
######################## | |
# Networking (Bridges & Forwarding) | |
######################## | |
# Enable IP forwarding for guest routing/NAT | |
net.ipv4.ip_forward = 1 | |
# Pass bridged traffic through the host's iptables/ip6tables/arptables chains. | |
# REQUIRED if using Proxmox VE firewall for VMs on bridges. | |
# Set to 0 if NOT using PVE firewall for bridged VMs and security is handled elsewhere, | |
# for a slight performance gain by bypassing host netfilter for bridged packets. | |
net.bridge.bridge-nf-call-iptables = 1 | |
net.bridge.bridge-nf-call-ip6tables = 1 | |
net.bridge.bridge-nf-call-arptables = 1 | |
######################## | |
# TCP & Socket Queues | |
######################## | |
# Maximum number of connection requests queued for a listening socket (listen() backlog) | |
net.core.somaxconn = 4096 | |
# Maximum number of packets queued on the input side of a network interface | |
# when the interface receives packets faster than the kernel can process them. | |
# Good for 10GbE+ or busy 1GbE. Monitor for rx_dropped. | |
net.core.netdev_max_backlog = 5000 | |
# Default and maximum TCP receive buffer size (bytes) | |
net.core.rmem_default = 262144 | |
net.core.rmem_max = 16777216 | |
# Default and maximum TCP send buffer size (bytes) | |
net.core.wmem_default = 262144 | |
net.core.wmem_max = 16777216 | |
# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state) | |
net.ipv4.tcp_max_syn_backlog= 2048 | |
# Enable TCP SYN cookies to help protect against SYN flood attacks | |
net.ipv4.tcp_syncookies = 1 | |
# Time to hold a socket in FIN-WAIT-2 state. Default is 60. | |
# Lowering helps free up socket resources faster on busy servers. | |
net.ipv4.tcp_fin_timeout = 15 | |
# Allow reusing sockets in TIME-WAIT state for new outgoing connections. Generally safe. | |
net.ipv4.tcp_tw_reuse = 1 | |
# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT and was removed in | |
# kernels >= 4.12, where even setting it to 0 makes 'sysctl --system' log a | |
# "cannot stat" error. Left commented out; uncomment only on pre-4.12 kernels. | |
# net.ipv4.tcp_tw_recycle = 0 | |
# Range of ephemeral ports for outgoing connections. Expands the available pool. | |
net.ipv4.ip_local_port_range= 1024 65535 | |
# TCP congestion control algorithm. BBR often improves throughput and latency. | |
# Ensure 'bbr' module is available and loaded. 'cubic' is the older default. | |
# Check availability: sysctl net.ipv4.tcp_available_congestion_control | |
net.ipv4.tcp_congestion_control = bbr | |
######################## | |
# Conntrack (Connection Tracking - for Netfilter/iptables) | |
# Adjust based on actual usage (cat /proc/sys/net/netfilter/nf_conntrack_count) | |
# Each entry uses ~300 bytes of non-swappable kernel memory. | |
######################## | |
# Maximum number of connection tracking entries. | |
net.netfilter.nf_conntrack_max = 262144 | |
# How long to keep an established TCP connection in the tracking table (seconds, 1 day) | |
net.netfilter.nf_conntrack_tcp_timeout_established = 86400 | |
# Timeout for generic (non-TCP/UDP/ICMP) protocol entries (seconds, 5 minutes) | |
net.netfilter.nf_conntrack_generic_timeout = 300 | |
######################## | |
# Security Hardening | |
######################## | |
# Enable strict reverse path filtering to prevent IP spoofing | |
net.ipv4.conf.all.rp_filter = 1 | |
net.ipv4.conf.default.rp_filter = 1 | |
# Disable acceptance of ICMP redirect messages (potential MITM vector) | |
net.ipv4.conf.all.accept_redirects= 0 | |
net.ipv4.conf.default.accept_redirects= 0 | |
# Disable acceptance of source routed packets (security risk) | |
net.ipv4.conf.all.accept_source_route = 0 | |
net.ipv4.conf.default.accept_source_route = 0 | |
# Log packets with impossible addresses (martians) | |
net.ipv4.conf.all.log_martians = 1 | |
net.ipv4.conf.default.log_martians = 1 | |
# For IPv6, similar hardening can be applied if IPv6 is actively used: | |
# net.ipv6.conf.all.accept_redirects = 0 | |
# net.ipv6.conf.default.accept_redirects = 0 | |
# net.ipv6.conf.all.accept_source_route = 0 | |
# net.ipv6.conf.default.accept_source_route = 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment