Last active
May 26, 2025 03:40
-
-
Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.
Kernel tuning for dedicated linux server. /etc/sysctl.d/60-sysctl.conf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/60-sysctl.conf
# Generic Web + DB Server Tuning
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl --system
# NOTE: sysctl.conf has no trailing-comment syntax; text after a value
# becomes part of the value and makes sysctl fail to apply the key.
# All comments in this file are therefore on their own lines.

########################
# Kernel & Memory
########################

# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3

# How aggressively the kernel will swap memory pages.
# Lower values tell the kernel to prefer dropping caches over swapping.
# 10 is a good starting point for DBs and general servers.
vm.swappiness = 10

# Maximum percentage of total system memory that can hold dirty pages
# before processes are forced to write data synchronously.
# 15% is reasonable with moderately fast storage. Consider lower (e.g., 10%) for slower disks.
vm.dirty_ratio = 15

# Percentage of total system memory where background kernel flusher threads
# will start writing dirty data to disk.
vm.dirty_background_ratio = 5

# '1' means the kernel will always approve memory requests, potentially overcommitting.
# Useful for applications like databases that might allocate large buffers upfront.
# Monitor for OOM killer activity if memory is tight.
vm.overcommit_memory = 1

# System-wide limit for open file handles.
# 4,194,304 is very generous. 2,097,152 is also often sufficient.
# Good for web servers (many sockets) and DBs (many data files, connections).
fs.file-max = 4194304

# Maximum number of processes/threads. Useful for web servers with many workers.
# Default is often 32768.
kernel.pid_max = 65536

# Minimum amount of free RAM (in KB) the system should maintain.
# Prevents system from running completely out of memory for critical operations.
# 128MB is a common value. Increase for systems with very large RAM (e.g. 256MB for 128GB+ RAM)
vm.min_free_kbytes = 131072

########################
# Network Core, Buffers & Qdisc
########################

# Default queuing discipline. fq_codel is excellent for reducing bufferbloat and ensuring fairness.
# If using BBR for TCP congestion control, 'fq' is its canonical partner and is
# also a fine choice here.
net.core.default_qdisc = fq_codel

# Maximum number of packets queued on the input side of a network interface
# when the interface receives packets faster than the kernel can process them.
# Good for 1GbE+ NICs under high load.
net.core.netdev_max_backlog = 5000

# Maximum number of connection requests queued for a listening socket (listen() backlog).
# Crucial for web servers handling many incoming connections.
# 1024 is okay, but higher values like 4096 or 8192 are common for busy web servers.
net.core.somaxconn = 4096

# Default and maximum socket receive/send buffer sizes (bytes).
# Defaults: 256KB. Maximums: 16MB.
# Very large defaults (e.g. 30MB/64MB) can consume significant memory if many
# sockets are open; Linux auto-tuning is generally good, so moderated values
# are better for general servers.
net.core.rmem_default = 262144
net.core.wmem_default = 262144
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
# Note: net.ipv4.tcp_rmem/wmem will override these for TCP sockets.

########################
# TCP Tuning
########################

# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1

# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state).
# Increased from 2048 for busy web servers.
net.ipv4.tcp_max_syn_backlog = 4096

# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy web servers.
# 20 is slightly more aggressive than 30.
net.ipv4.tcp_fin_timeout = 20

# Allow reusing sockets in TIME-WAIT state for new outgoing connections.
# Requires tcp_timestamps=1. Useful if the server makes many outgoing connections
# (e.g., web server to backend DB, or DB to other services).
net.ipv4.tcp_tw_reuse = 1

# Enable TCP Timestamps. Required for tcp_tw_reuse and helps with RTT estimation.
# Usually enabled by default, but good to be explicit.
net.ipv4.tcp_timestamps = 1

# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT and the sysctl was
# removed in kernels >= 4.12, where even writing 0 makes sysctl report an
# unknown-key error. Left commented out; uncomment only on kernels < 4.12.
# net.ipv4.tcp_tw_recycle = 0

# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 mins of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5

# Range of ephemeral ports for outgoing connections. Expands the available pool.
# Max is 65535.
net.ipv4.ip_local_port_range = 1024 65535

# TCP receive and send buffer sizes (min, default, max in bytes).
# These override net.core.rmem/wmem_default/max for TCP.
# A max of 16MB is usually sufficient for a generic server; the defaults
# (87KB read, 64KB write) are reasonable starting points.
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216

# TCP congestion control algorithm. BBR often improves throughput and latency,
# especially over connections with some packet loss or varying RTTs.
# Ensure the 'tcp_bbr' module is available and loaded.
net.ipv4.tcp_congestion_control = bbr

# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data (common for web).
net.ipv4.tcp_slow_start_after_idle = 0

# Enable Path MTU discovery probing. '1' enables after ICMP black hole detection.
net.ipv4.tcp_mtu_probing = 1

# Do not save TCP metrics from closed connections. Can be useful if routes/paths change.
net.ipv4.tcp_no_metrics_save = 1

########################
# Conntrack (Netfilter Connection Tracking)
# Important if the server is behind a firewall, acts as a firewall/NAT,
# or uses local firewall rules extensively.
# Adjust based on actual usage (cat /proc/sys/net/netfilter/nf_conntrack_count)
# Each entry uses ~300 bytes of non-swappable kernel memory.
########################

# Max entries (524288 * ~300 bytes = ~150MB RAM).
net.netfilter.nf_conntrack_max = 524288
# 2 hours for established TCP.
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
# Should be >= net.ipv4.tcp_fin_timeout.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 120

########################
# Security & Misc
########################

# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1

# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0

# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0

# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1

# For IPv6, if actively used:
# net.ipv6.conf.all.accept_redirects = 0
# net.ipv6.conf.default.accept_redirects = 0
# net.ipv6.conf.all.accept_source_route = 0
# net.ipv6.conf.default.accept_source_route = 0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/60-sysctl.conf
# Optimized Generic Web + DB Server Tuning for CentOS 6 (kernel 2.6.32)
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl -p /etc/sysctl.d/60-sysctl.conf
# or reboot. To apply to current system without reboot: sysctl --system
# NOTE: sysctl.conf has no trailing-comment syntax; text after a value
# becomes part of the value and makes sysctl fail to apply the key.
# All comments in this file are therefore on their own lines.

########################
# Kernel & Memory / I/O Writeback
########################

# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3

# How aggressively the kernel will swap memory pages.
# Lower values tell the kernel to prefer dropping caches over swapping.
# 10 is a good starting point for DBs and general servers.
vm.swappiness = 10

# Tend to keep dentry/inode caches longer, good for frequent file access (DB data files, web static content).
vm.vfs_cache_pressure = 50

# Maximum percentage of total system memory that can hold dirty pages
# before processes are forced to write data synchronously.
# 15% is reasonable with moderately fast storage. For very slow disks, consider 10%.
vm.dirty_ratio = 15

# Percentage of total system memory where background kernel flusher threads
# will start writing dirty data to disk.
vm.dirty_background_ratio = 5

# Minimum amount of free RAM (in KB) the system should maintain.
# 64MB is a reasonable minimum for systems of this era.
vm.min_free_kbytes = 65536

# '1' means the kernel will always approve memory requests, potentially overcommitting.
# Useful for applications like databases that might allocate large buffers upfront.
# Monitor for OOM killer activity if memory is tight.
vm.overcommit_memory = 1

########################
# File Handles & Tasks
########################

# System-wide limit for open file handles.
# Good for web servers (many sockets) and DBs (many data files, connections).
fs.file-max = 2097152

# Maximum number of processes/threads. Useful for web servers with many workers.
# Default is often 32768.
kernel.pid_max = 65536

########################
# Network Core, Buffers & Queues
########################

# Maximum number of connection requests queued for a listening socket (listen() backlog).
# Crucial for web servers handling many incoming connections.
# Increased from original 1024.
net.core.somaxconn = 4096

# Maximum number of packets queued on the input side of a network interface
# when the interface receives packets faster than the kernel can process them.
# Good for 1GbE NICs under high load.
net.core.netdev_max_backlog = 5000

# Default socket receive buffer size (bytes).
net.core.rmem_default = 262144
# Default socket send buffer size (bytes).
net.core.wmem_default = 262144
# Maximum socket receive buffer size (16MB).
net.core.rmem_max = 16777216
# Maximum socket send buffer size (16MB).
net.core.wmem_max = 16777216
# Note: net.ipv4.tcp_rmem/wmem will override these for TCP sockets.

########################
# TCP Tuning (Kernel 2.6.32 specific considerations)
########################

# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1

# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state).
# Increase for busy web servers. Should generally be >= somaxconn.
net.ipv4.tcp_max_syn_backlog = 4096

# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy web servers.
# 25 is slightly more conservative than the 15-20 used on newer kernels.
net.ipv4.tcp_fin_timeout = 25

# Allow reusing sockets in TIME-WAIT state for new outgoing connections.
# Requires tcp_timestamps=1. Useful if the server makes many outgoing connections.
net.ipv4.tcp_tw_reuse = 1

# Enable TCP Timestamps. Required for tcp_tw_reuse and helps with RTT estimation.
# Often on by default, but explicit is better.
net.ipv4.tcp_timestamps = 1

# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT.
# Setting to 0 ensures it's off (the sysctl still exists on kernel 2.6.32).
net.ipv4.tcp_tw_recycle = 0

# Range of ephemeral ports for outgoing connections. Expands the available pool.
# Max is 65535.
net.ipv4.ip_local_port_range = 1024 65535

# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 mins of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5

# TCP receive and send buffer sizes (min, default, max in bytes).
# These override net.core.rmem/wmem_default/max for TCP.
# Max values match net.core limits.
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216

# Enable TCP window scaling (RFC 1323). Usually on by default.
net.ipv4.tcp_window_scaling = 1

# Enable Selective Acknowledgements (SACK). Usually on by default.
net.ipv4.tcp_sack = 1

# Enable TCP auto-tuning of receive buffer. Usually on by default.
net.ipv4.tcp_moderate_rcvbuf = 1

# TCP Congestion Control: CentOS 6 (kernel 2.6.32) defaults to 'cubic'.
# 'bbr' is not available. 'cubic' is generally the best choice for this kernel.
# No need to set it if it's already the default, but you could explicitly set:
# net.ipv4.tcp_congestion_control = cubic

# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data (common for web).
net.ipv4.tcp_slow_start_after_idle = 0

########################
# Conntrack (Netfilter Connection Tracking)
# Important if the server has a local firewall (iptables) or handles NAT.
# For CentOS 6, the parameter might be ip_conntrack_max if nf_conntrack module isn't dominant.
# nf_conntrack is generally preferred if available and loaded.
# Check with: lsmod | grep -E "nf_conntrack|ip_conntrack"
# And then check existence of /proc/sys/net/netfilter/nf_conntrack_max or /proc/sys/net/ipv4/ip_conntrack_max
########################

# Assuming nf_conntrack is in use:
# Max entries (262144 * ~300 bytes = ~75MB RAM).
net.netfilter.nf_conntrack_max = 262144
# 2 hours for established TCP.
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
# Should be >= net.ipv4.tcp_fin_timeout.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 120

# If using the older ip_conntrack (less likely for later 2.6.32 but possible),
# adjust based on memory:
# net.ipv4.ip_conntrack_max = 65536
# net.ipv4.ip_conntrack_tcp_timeout_established = 7200

########################
# Security & Hardening
########################

# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1

# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0

# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0

# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1

########################
# IPv6 (Optional - Disable if not used)
# Disabling IPv6 can slightly reduce kernel overhead and attack surface if unused.
########################

# Uncomment the following lines if you are NOT using IPv6:
# net.ipv6.conf.all.disable_ipv6 = 1
# net.ipv6.conf.default.disable_ipv6 = 1
# net.ipv6.conf.lo.disable_ipv6 = 1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/80-k8s-ipvs-optimized.conf
# Optimized Kubernetes Node Tuning for kube-proxy IPVS mode
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl --system
# NOTE: sysctl.conf has no trailing-comment syntax; text after a value
# becomes part of the value and makes sysctl fail to apply the key.
# All comments in this file are therefore on their own lines.

########################
# System & Memory
########################

# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3

# System-wide limit for open file handles. High value is good for many containers.
fs.file-max = 2097152

# Maximum number of processes/threads. Increased for many pods/containers.
# Consider also kernel.threads-max if pid_max is very aggressively increased.
kernel.pid_max = 65536

# Strongly prefer dropping caches over swapping out Pod/application memory.
# Kubernetes workloads should ideally have memory requests/limits set to avoid swapping.
# Some use 1 or 0 for K8s nodes if RAM is plentiful.
vm.swappiness = 10

# Allow memory overcommit. '1' means the kernel will always approve memory requests.
# Crucial for K8s to allow Pods to start, but ensure proper resource requests/limits
# to prevent actual memory exhaustion leading to OOM killer activity.
vm.overcommit_memory = 1

########################
# TCP Performance
########################

# Enable TCP window scaling, selective acknowledgements, and timestamps. Standard for modern TCP.
net.ipv4.tcp_window_scaling = 1
net.ipv4.tcp_sack = 1
# Timestamps are required for tcp_tw_reuse and also help RTT estimation.
net.ipv4.tcp_timestamps = 1

# Enable TCP Fast Open (TFO). '3' means enable for client and server.
# Can reduce latency for repeated connections if supported by client and server.
net.ipv4.tcp_fastopen = 3

# Max SYN backlog. Increased to handle bursts of new connections to services.
net.ipv4.tcp_max_syn_backlog = 8192

# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1

# Number of SYN retries before giving up. Lowering can speed up failure detection.
# Default is often 5 or 6. For internal K8s traffic, 2-3 might be acceptable.
net.ipv4.tcp_syn_retries = 3

# Number of SYN+ACK retries. Similar considerations.
net.ipv4.tcp_synack_retries = 3

# Range of ephemeral ports for outgoing connections. Expands the available pool.
net.ipv4.ip_local_port_range = 1024 65535

# TCP congestion control. BBR is good for throughput and latency.
# Ensure the 'tcp_bbr' module is available and loaded, and ideally use 'fq' qdisc.
net.ipv4.tcp_congestion_control = bbr

# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data.
net.ipv4.tcp_slow_start_after_idle = 0

# Enable Path MTU discovery probing. '1' enables after ICMP black hole detection.
net.ipv4.tcp_mtu_probing = 1

# Do not save TCP metrics from closed connections. Can be useful if routes/paths change.
net.ipv4.tcp_no_metrics_save = 1

# Kernel attempts to aggregate small writes before sending (Nagle's algorithm related).
# Default is usually 1.
net.ipv4.tcp_autocorking = 1

# Limit unsent data queued in the socket send buffer to 16KB (TCP_NOTSENT_LOWAT).
# Reduces per-socket send-buffer memory and queueing latency.
# NOTE(review): this is TCP_NOTSENT_LOWAT, not SO_SNDLOWAT as the original
# comment claimed.
net.ipv4.tcp_notsent_lowat = 16384

########################
# Network Buffers & Qdisc
########################

# Default queuing discipline. fq_codel is excellent for fairness and reducing bufferbloat.
# If using BBR, 'fq' is its canonical partner, but fq_codel is also a very strong choice.
net.core.default_qdisc = fq_codel

# Maximum number of packets queued on the input side of a network interface.
# Very high value, good for high-speed NICs (10GbE+) to prevent drops.
net.core.netdev_max_backlog = 30000

# Maximum number of connection requests queued for a listening socket (listen() backlog)
# Increased from original 1024 for better handling of connection bursts.
net.core.somaxconn = 4096

# Default and maximum socket receive/send buffer sizes.
# Defaults: 1MB. Maximums: 16MB.
# Very large fixed defaults (30MB/64MB) can consume excessive memory per socket;
# Linux auto-tuning (net.ipv4.tcp_moderate_rcvbuf=1 by default) usually works well,
# so these are generous without being extreme.
net.core.rmem_default = 1048576
net.core.wmem_default = 1048576
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216

########################
# Fragmentation & Time-Wait
########################

# Memory limits for IP fragment reassembly (bytes). Increased for robustness.
# High threshold: 4MB.
net.ipv4.ipfrag_high_thresh = 4194304
# Low threshold: 3MB.
net.ipv4.ipfrag_low_thresh = 3145728
# Timeout for reassembling fragments (seconds).
net.ipv4.ipfrag_time = 30

# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy servers.
net.ipv4.tcp_fin_timeout = 15

# tcp_tw_recycle: DO NOT ENABLE. Problematic with NAT, removed in kernels >= 4.12,
# where even writing 0 makes sysctl report an unknown-key error.
# Left commented out; uncomment only on kernels < 4.12.
# net.ipv4.tcp_tw_recycle = 0

# tcp_tw_reuse: Allow reusing sockets in TIME-WAIT state for new OUTGOING connections.
# Requires tcp_timestamps=1. Generally safe and beneficial for nodes making many
# outgoing connections (e.g., to backend Pods). Changed from original 0.
net.ipv4.tcp_tw_reuse = 1

# Maximum number of sockets in TIME-WAIT state.
# High value for K8s due to high connection churn (default is often much lower).
# Monitor actual usage.
net.ipv4.tcp_max_tw_buckets = 1440000

# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 mins of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5

########################
# Routing & Bridge/NF (Essential for K8s)
########################

# Enable IP forwarding. Critical for K8s nodes.
net.ipv4.ip_forward = 1

# Pass bridged IPv4/IPv6 traffic through netfilter (iptables/nftables) hooks.
# CRITICAL for K8s CNI plugins and network policies to function correctly,
# even when IPVS is used for service load balancing.
# NOTE: these keys exist only after the br_netfilter module is loaded
# (e.g. via /etc/modules-load.d/); otherwise sysctl reports them as unknown.
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1

# For IPv6 if used in the cluster:
# net.ipv6.conf.all.forwarding = 1
# net.ipv6.conf.default.forwarding = 1

########################
# IPVS (kube-proxy specific settings)
# NOTE: net.ipv4.vs.* keys exist only after the ip_vs module is loaded.
########################

# Enable IPVS connection tracking integration with netfilter conntrack.
# This is often required for network policies (Calico, Cilium, etc.) to correctly
# see and filter IPVS-handled traffic.
net.ipv4.vs.conntrack = 1

# Expire connections to a destination server when it's removed from the service.
# Helps in faster convergence when backend Pods are deleted.
net.ipv4.vs.expire_nodest_conn = 1

# Expire persistent templates when a destination server is quiesced (weight 0).
# Useful for graceful backend removal.
net.ipv4.vs.expire_quiescent_template = 1

# Be more lenient with TCP state transitions for IPVS. Can help with some clients/NAT.
net.ipv4.vs.sloppy_tcp = 1

# Be more lenient with UDP "connections" for IPVS.
net.ipv4.vs.sloppy_udp = 1

########################
# Conntrack (Netfilter Connection Tracking)
# Values depend heavily on cluster size, traffic patterns, and available RAM.
# Each conntrack entry uses ~300 bytes of non-swappable kernel memory.
########################

# Maximum number of connection tracking entries.
# 1,048,576 * ~300 bytes = ~300MB RAM. Adjust based on monitoring.
# Monitor with: cat /proc/sys/net/netfilter/nf_conntrack_count
net.netfilter.nf_conntrack_max = 1048576

# Timeout for established TCP connections (seconds). Default is 5 days (432000).
# 12 hours (43200) can help recycle entries faster but might drop long-lived idle connections.
net.netfilter.nf_conntrack_tcp_timeout_established = 43200

# Shorter timeouts for TCP connections in closing states.
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60

# TIME_WAIT timeout in conntrack should generally align with net.ipv4.tcp_fin_timeout
# or be slightly longer. Default is 120s.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120

# Timeout for generic (non-TCP/UDP/ICMP) protocol entries.
net.netfilter.nf_conntrack_generic_timeout = 60

########################
# Security Hardening
########################

# Enable strict reverse path filtering to prevent IP spoofing.
net.ipv4.conf.all.rp_filter = 1
# Set to 2 (loose) if asymmetric routing is used on the node itself.
net.ipv4.conf.default.rp_filter = 1

# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0

# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0

# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1

# Ignore ICMP echo requests to broadcast/multicast addresses
net.ipv4.icmp_echo_ignore_broadcasts = 1

# NOTE(review): there is no "net.ipv4.icmp_timestamp_ignore_all" sysctl in the
# Linux kernel; the original line would make sysctl report an unknown-key error.
# To drop ICMP timestamp requests, use a firewall rule instead, e.g.:
#   iptables -A INPUT -p icmp --icmp-type timestamp-request -j DROP
# net.ipv4.icmp_timestamp_ignore_all = 1

########################
# ARP Cache (Potentially useful for K8s with many pods/services on same L2)
########################

# Adjust ARP cache garbage collection thresholds if you have a very large number of
# active IP addresses on the same L2 network as the node.
# Threshold1: Soft limit, GC starts here.
# Threshold2: Hard limit, GC becomes more aggressive.
# Threshold3: Absolute max, entries might be dropped.
# Defaults are often 128, 512, 1024. Increase if ARP cache overflows are suspected.
# net.ipv4.neigh.default.gc_thresh1 = 1024
# net.ipv4.neigh.default.gc_thresh2 = 2048
# net.ipv4.neigh.default.gc_thresh3 = 4096
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/80-k8s.conf | |
# Optimized Kubernetes Node Network Tuning for kube-proxy native nftables mode | |
# Based on original by Kawin Viriyaprasopsook <[email protected]> | |
# Apply with: sysctl --system | |
######################## | |
# Kernel & Memory | |
######################## | |
# Reduce console noise from kernel messages | |
kernel.printk = 3 4 1 3 | |
# Strongly prefer dropping caches over swapping out Pod/application memory. | |
# Kubernetes workloads should ideally have memory requests/limits set to avoid swapping. | |
vm.swappiness = 10 # Some even use 1 or 0 for K8s nodes if RAM is plentiful. | |
# Allow memory overcommit. '1' means the kernel will always approve memory requests. | |
# This can be useful for applications that request more memory than they immediately use. | |
# However, ensure your K8s resource requests/limits are well-defined to prevent | |
# actual memory exhaustion leading to OOM killer activity. | |
vm.overcommit_memory = 1 | |
# System-wide limit for open file handles. High value is good for many containers. | |
fs.file-max = 2097152 | |
# Maximum number of processes/threads. Increased for many pods/containers. | |
kernel.pid_max = 65536 | |
# Consider also kernel.threads-max if pid_max is very aggressively increased, | |
# though pid_max usually governs overall process/thread count. | |
######################## | |
# TCP Performance | |
######################## | |
# Enable TCP window scaling, selective acknowledgements, and timestamps. | |
# These are standard for modern TCP and generally beneficial. | |
net.ipv4.tcp_window_scaling = 1 | |
net.ipv4.tcp_sack = 1 | |
net.ipv4.tcp_timestamps = 1 # Needed for tcp_tw_reuse, also helps RTT estimation. | |
# Enable TCP Fast Open (TFO). '3' means enable for client and server. | |
# Can reduce latency for repeated connections if supported by client and server. | |
net.ipv4.tcp_fastopen = 3 | |
# Max SYN backlog. Increased to handle bursts of new connections to services. | |
net.ipv4.tcp_max_syn_backlog= 8192 | |
# Enable TCP SYN cookies to help protect against SYN flood attacks. | |
net.ipv4.tcp_syncookies = 1 | |
# Number of SYN retries before giving up. Lowering can speed up failure detection | |
# for unresponsive peers but might be too aggressive for lossy external networks. | |
# Default is often 5 or 6. For internal K8s traffic, 2 might be acceptable. | |
net.ipv4.tcp_syn_retries = 2 | |
# Number of SYN+ACK retries. Similar considerations as tcp_syn_retries. | |
net.ipv4.tcp_synack_retries = 2 | |
# Range of ephemeral ports for outgoing connections. Expands the available pool. | |
net.ipv4.ip_local_port_range= 1024 65535 | |
# TCP congestion control. BBR is good for throughput and latency. | |
# Ensure 'bbr' module is available and loaded, and ideally use 'fq' qdisc. | |
net.ipv4.tcp_congestion_control = bbr | |
# Disable restarting TCP slow start after an idle period. | |
# Can improve performance for connections that are idle then burst data. | |
net.ipv4.tcp_slow_start_after_idle = 0 | |
# Enable Path MTU discovery probing. | |
net.ipv4.tcp_mtu_probing = 1 # '0' (disabled), '1' (enabled only after ICMP black hole), '2' (always enabled) | |
# Do not save TCP metrics from closed connections. Can be useful if routes/paths change. | |
net.ipv4.tcp_no_metrics_save= 1 | |
# Kernel attempts to aggregate small writes before sending (Nagle's algorithm related). | |
# Default is usually 1. | |
net.ipv4.tcp_autocorking = 1 | |
# Socket send buffer low watermark (SO_SNDLOWAT). 16KB. | |
# This tells the kernel to try to ensure there's at least this much unsent data | |
# available for the NIC to DMA, potentially improving throughput for some drivers/NICs. | |
net.ipv4.tcp_notsent_lowat = 16384 | |
######################## | |
# Network Buffers & Qdisc | |
######################## | |
# Maximum number of packets queued on the input side of a network interface. | |
# Very high value, good for high-speed NICs (10GbE+) to prevent drops. | |
net.core.netdev_max_backlog= 30000 | |
# Default and maximum TCP receive/send buffer sizes. | |
# The original values (30MB/64MB) are very large and might consume excessive memory | |
# per socket, especially if many sockets don't need such large buffers. | |
# Moderating these while still being generous is often better. | |
# Linux auto-tuning (net.ipv4.tcp_moderate_rcvbuf=1 by default) usually works well. | |
# Setting very large fixed defaults can sometimes be counterproductive unless you have | |
# specific high BDP (Bandwidth-Delay Product) paths that require them. | |
# These are still generous values: | |
# Defaults of 1MB per socket, hard caps of 16MB: | |
net.core.rmem_default = 1048576 | |
net.core.wmem_default = 1048576 | |
net.core.rmem_max = 16777216 | |
net.core.wmem_max = 16777216 | |
# Default queuing discipline. fq_codel is excellent for fairness and reducing bufferbloat. | |
# If using BBR, 'fq' is its canonical partner, but fq_codel is also a very strong choice. | |
net.core.default_qdisc = fq_codel | |
######################## | |
# IP Fragmentation | |
######################## | |
# Memory limits for IP fragment reassembly. | |
# These allow buffering more fragments if heavy fragmentation occurs. | |
# K8s networking (CNIs) generally tries to avoid fragmentation. | |
# High threshold 4MB, low threshold 3MB (raised from older 256KB/192KB defaults). | |
net.ipv4.ipfrag_high_thresh = 4194304 | |
net.ipv4.ipfrag_low_thresh = 3145728 | |
# Timeout for reassembling fragments (seconds). | |
net.ipv4.ipfrag_time = 30 | |
######################## | |
# TIME-WAIT & Keepalive | |
######################## | |
# tcp_tw_recycle: DO NOT ENABLE. Problematic with NAT, and the sysctl was removed | |
# in kernels >= 4.12, where even setting it to 0 makes 'sysctl --system' log a | |
# "cannot stat" error. Left commented out; uncomment only on pre-4.12 kernels. | |
# net.ipv4.tcp_tw_recycle = 0 | |
# tcp_tw_reuse: Allow reusing sockets in TIME-WAIT state for new OUTGOING connections. | |
# Requires tcp_timestamps=1. Generally safe and beneficial for nodes making many | |
# outgoing connections (e.g., to other services, external APIs). | |
# Original had 0, changing to 1 is generally recommended for busy servers. | |
net.ipv4.tcp_tw_reuse = 1 | |
# Maximum number of sockets in TIME-WAIT state. | |
# High value for K8s due to high connection churn; the kernel default is often | |
# much lower. Monitor actual usage. | |
net.ipv4.tcp_max_tw_buckets = 1440000 | |
# TCP Keepalive settings: Detect dead connections. | |
# First probe after 10 minutes of idleness, subsequent probes every 60 seconds, | |
# connection declared dead after 5 failed probes. | |
net.ipv4.tcp_keepalive_time = 600 | |
net.ipv4.tcp_keepalive_intvl = 60 | |
net.ipv4.tcp_keepalive_probes = 5 | |
######################## | |
# Security & ICMP | |
######################## | |
# Disable acceptance of ICMP redirect messages (potential MITM vector) | |
net.ipv4.conf.all.accept_redirects = 0 | |
net.ipv4.conf.default.accept_redirects = 0 | |
# Disable acceptance of source routed packets (security risk) | |
net.ipv4.conf.all.accept_source_route = 0 | |
net.ipv4.conf.default.accept_source_route= 0 | |
# Enable strict reverse path filtering to prevent IP spoofing. | |
# Use 2 (loose) instead if asymmetric routing is used on the node itself. | |
net.ipv4.conf.all.rp_filter = 1 | |
net.ipv4.conf.default.rp_filter = 1 | |
# Log packets with impossible source addresses (martians) | |
net.ipv4.conf.all.log_martians = 1 | |
# Ignore ICMP echo requests to broadcast/multicast addresses | |
net.ipv4.icmp_echo_ignore_broadcasts = 1 | |
# There is no 'icmp_timestamp_ignore_all' sysctl in Linux; applying it only makes | |
# 'sysctl --system' log an error. To ignore ICMP timestamp requests, drop ICMP | |
# type 13 with the firewall (iptables/nftables) instead. | |
# net.ipv4.icmp_timestamp_ignore_all = 1 | |
######################## | |
# Routing & Bridge/nft (Essential for K8s) | |
######################## | |
# Enable IP forwarding. Critical for K8s nodes. | |
net.ipv4.ip_forward = 1 | |
# Pass bridged IPv4/IPv6 traffic through netfilter (iptables/nftables) hooks. | |
# CRITICAL for K8s CNI plugins and kube-proxy (even in nftables mode for its own rules) | |
# to apply network policies, NAT, service routing, etc. | |
net.bridge.bridge-nf-call-iptables = 1 | |
net.bridge.bridge-nf-call-ip6tables = 1 | |
# nftables uses the same L2 hooks as iptables for bridged traffic. | |
# For IPv6 if used in the cluster: | |
# net.ipv6.conf.all.forwarding = 1 | |
# net.ipv6.conf.default.forwarding = 1 | |
######################## | |
# Conntrack (nftables mode relies on nf_conntrack) | |
# Values depend heavily on cluster size, traffic patterns, and available RAM. | |
# Each conntrack entry uses ~300 bytes of non-swappable kernel memory. | |
######################## | |
# Maximum number of connection tracking entries. | |
# 1,048,576 * ~300 bytes = ~300MB RAM. Adjust based on monitoring. | |
net.netfilter.nf_conntrack_max = 1048576 | |
# Monitor with: cat /proc/sys/net/netfilter/nf_conntrack_count | |
# Timeout for established TCP connections (seconds). Default is 5 days (432000). | |
# 12 hours (43200) can help recycle entries faster but might drop long-lived idle connections. | |
net.netfilter.nf_conntrack_tcp_timeout_established = 43200 | |
# Shorter timeouts for TCP connections in closing states. | |
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60 | |
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60 | |
# Keep roughly in line with the TCP TIME-WAIT interval. | |
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120 | |
# Timeout for generic (non-TCP/UDP/ICMP) protocol entries. | |
net.netfilter.nf_conntrack_generic_timeout = 60 | |
######################## | |
# ARP Cache (Potentially useful for K8s with many pods/services) | |
######################## | |
# Adjust ARP cache garbage collection thresholds if you have a very large number of | |
# active IP addresses on the same L2 network as the node. | |
# Threshold1: Soft limit, GC starts here. | |
# Threshold2: Hard limit, GC becomes more aggressive. | |
# Threshold3: Absolute max, entries might be dropped. | |
# Defaults are often 128, 512, 1024. Increase if ARP cache overflows are suspected. | |
# net.ipv4.neigh.default.gc_thresh1 = 1024 | |
# net.ipv4.neigh.default.gc_thresh2 = 2048 | |
# net.ipv4.neigh.default.gc_thresh3 = 4096 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /etc/sysctl.d/80-pve.conf | |
# Optimized Proxmox VE Host Tuning | |
# Based on original by Kawin Viriyaprasopsook <[email protected]> | |
# Apply with: sysctl --system | |
######################## | |
# Memory & VM Caching | |
######################## | |
# Reduce console noise from kernel messages | |
kernel.printk = 3 4 1 3 | |
# Strongly prefer dropping caches over swapping out application (VM) memory | |
vm.swappiness = 10 | |
# Tend to keep dentry/inode caches longer, good for frequent file access (VM disks) | |
vm.vfs_cache_pressure = 50 | |
# Max % of total memory for dirty pages before forcing synchronous writes | |
vm.dirty_ratio = 10 | |
# % of total memory for dirty pages before background kernel flusher threads start writing | |
vm.dirty_background_ratio = 5 | |
# For systems with very fast storage and lots of RAM, consider using | |
# vm.dirty_bytes and vm.dirty_background_bytes for more absolute control. | |
# Minimum amount of free RAM (in KB) the system should maintain (128MB) | |
# Increase if you have very large amounts of RAM (e.g., 256GB+) | |
vm.min_free_kbytes = 131072 | |
# Maximum number of memory map areas a process can have | |
# Increased for some containerized workloads (e.g., Elasticsearch) or complex applications | |
vm.max_map_count = 262144 | |
######################## | |
# File Handles & PIDs | |
######################## | |
# System-wide limit for open file handles | |
fs.file-max = 2097152 | |
# Maximum number of processes/threads the system can have | |
# Increased from default (often 32768) for busy virtualization hosts | |
kernel.pid_max = 65536 | |
# Consider kernel.threads-max as well if pid_max is significantly increased, | |
# though pid_max often covers thread limits too. | |
######################## | |
# Networking (Bridges & Forwarding) | |
######################## | |
# Enable IP forwarding for guest routing/NAT | |
net.ipv4.ip_forward = 1 | |
# Pass bridged traffic through the host's iptables/ip6tables/arptables chains. | |
# REQUIRED if using Proxmox VE firewall for VMs on bridges. | |
# Set to 0 if NOT using PVE firewall for bridged VMs and security is handled elsewhere, | |
# for a slight performance gain by bypassing host netfilter for bridged packets. | |
net.bridge.bridge-nf-call-iptables = 1 | |
net.bridge.bridge-nf-call-ip6tables = 1 | |
net.bridge.bridge-nf-call-arptables = 1 | |
######################## | |
# TCP & Socket Queues | |
######################## | |
# Maximum number of connection requests queued for a listening socket (listen() backlog) | |
net.core.somaxconn = 4096 | |
# Maximum number of packets queued on the input side of a network interface | |
# when the interface receives packets faster than the kernel can process them. | |
# Good for 10GbE+ or busy 1GbE. Monitor for rx_dropped. | |
net.core.netdev_max_backlog = 5000 | |
# Default and maximum TCP receive buffer size (bytes) | |
net.core.rmem_default = 262144 | |
net.core.rmem_max = 16777216 | |
# Default and maximum TCP send buffer size (bytes) | |
net.core.wmem_default = 262144 | |
net.core.wmem_max = 16777216 | |
# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state) | |
net.ipv4.tcp_max_syn_backlog= 2048 | |
# Enable TCP SYN cookies to help protect against SYN flood attacks | |
net.ipv4.tcp_syncookies = 1 | |
# Time to hold a socket in FIN-WAIT-2 state. Default is 60. | |
# Lowering helps free up socket resources faster on busy servers. | |
net.ipv4.tcp_fin_timeout = 15 | |
# Allow reusing sockets in TIME-WAIT state for new outgoing connections. Generally safe. | |
net.ipv4.tcp_tw_reuse = 1 | |
# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT and was removed in | |
# kernels >= 4.12, where even setting it to 0 makes 'sysctl --system' log a | |
# "cannot stat" error. Left commented out; uncomment only on pre-4.12 kernels. | |
# net.ipv4.tcp_tw_recycle = 0 | |
# Range of ephemeral ports for outgoing connections. Expands the available pool. | |
net.ipv4.ip_local_port_range= 1024 65535 | |
# TCP congestion control algorithm. BBR often improves throughput and latency. | |
# Ensure 'bbr' module is available and loaded. 'cubic' is the older default. | |
# Check availability: sysctl net.ipv4.tcp_available_congestion_control | |
net.ipv4.tcp_congestion_control = bbr | |
######################## | |
# Conntrack (Connection Tracking - for Netfilter/iptables) | |
# Adjust based on actual usage (cat /proc/sys/net/netfilter/nf_conntrack_count) | |
# Each entry uses ~300 bytes of non-swappable kernel memory. | |
######################## | |
# Maximum number of connection tracking entries. | |
net.netfilter.nf_conntrack_max = 262144 | |
# How long to keep an established TCP connection in the tracking table (seconds, 1 day) | |
net.netfilter.nf_conntrack_tcp_timeout_established = 86400 | |
# Timeout for generic (non-TCP/UDP/ICMP) protocol entries (seconds, 5 minutes) | |
net.netfilter.nf_conntrack_generic_timeout = 300 | |
######################## | |
# Security Hardening | |
######################## | |
# Enable strict reverse path filtering to prevent IP spoofing | |
net.ipv4.conf.all.rp_filter = 1 | |
net.ipv4.conf.default.rp_filter = 1 | |
# Disable acceptance of ICMP redirect messages (potential MITM vector) | |
net.ipv4.conf.all.accept_redirects= 0 | |
net.ipv4.conf.default.accept_redirects= 0 | |
# Disable acceptance of source routed packets (security risk) | |
net.ipv4.conf.all.accept_source_route = 0 | |
net.ipv4.conf.default.accept_source_route = 0 | |
# Log packets with impossible addresses (martians) | |
net.ipv4.conf.all.log_martians = 1 | |
net.ipv4.conf.default.log_martians = 1 | |
# For IPv6, similar hardening can be applied if IPv6 is actively used: | |
# net.ipv6.conf.all.accept_redirects = 0 | |
# net.ipv6.conf.default.accept_redirects = 0 | |
# net.ipv6.conf.all.accept_source_route = 0 | |
# net.ipv6.conf.default.accept_source_route = 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment