Skip to content

Instantly share code, notes, and snippets.

@bouroo
Last active May 26, 2025 03:40
Show Gist options
  • Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.
Save bouroo/bc52ad58a6e75d44e5235b229e9ca988 to your computer and use it in GitHub Desktop.
Kernel tuning for dedicated linux server. /etc/sysctl.d/60-sysctl.conf
# /etc/sysctl.d/60-sysctl.conf
# Generic Web + DB Server Tuning
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl --system
########################
# Kernel & Memory
########################
# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3
# How aggressively the kernel will swap memory pages.
# Lower values tell the kernel to prefer dropping caches over swapping.
# 10 is a good starting point for DBs and general servers.
vm.swappiness = 10
# Maximum percentage of total system memory that can hold dirty pages
# before processes are forced to write data synchronously.
# 15% is reasonable with moderately fast storage. Consider lower (e.g., 10%) for slower disks.
vm.dirty_ratio = 15
# Percentage of total system memory where background kernel flusher threads
# will start writing dirty data to disk.
vm.dirty_background_ratio = 5
# '1' means the kernel will always approve memory requests, potentially overcommitting.
# Useful for applications like databases that might allocate large buffers upfront.
# Monitor for OOM killer activity if memory is tight.
vm.overcommit_memory = 1
# System-wide limit for open file handles.
# 4,194,304 is very generous. 2,097,152 is also often sufficient.
# Good for web servers (many sockets) and DBs (many data files, connections).
fs.file-max = 4194304
# Maximum number of processes/threads. Useful for web servers with many workers.
# Default is often 32768.
kernel.pid_max = 65536
# Minimum amount of free RAM (in KB) the system should maintain.
# Prevents system from running completely out of memory for critical operations.
# 128MB is a common value. Increase for systems with very large RAM (e.g. 256MB for 128GB+ RAM)
vm.min_free_kbytes = 131072
########################
# Network Core, Buffers & Qdisc
########################
# Default queuing discipline. fq_codel is excellent for reducing bufferbloat and ensuring fairness.
# If using BBR for TCP congestion control, 'fq' is its canonical partner, but fq_codel is also very good.
# ('fq' is the alternative when strictly following BBR's recommendation.)
net.core.default_qdisc = fq_codel
# Maximum number of packets queued on the input side of a network interface
# when the interface receives packets faster than the kernel can process them.
# Good for 1GbE+ NICs under high load.
net.core.netdev_max_backlog = 5000
# Maximum number of connection requests queued for a listening socket (listen() backlog).
# Crucial for web servers handling many incoming connections.
# 1024 is okay, but higher values like 4096 or 8192 are common for busy web servers.
net.core.somaxconn = 4096
# Default and maximum socket receive buffer size (bytes).
# The original 30MB/64MB values are extremely large for defaults and can consume
# significant memory if many sockets are open. Linux auto-tuning is generally good.
# Moderated values are often better for general servers.
# Defaults: 256KB each; maximums: 16MB each.
# NOTE: sysctl does not support trailing comments, so notes stay on their own lines.
net.core.rmem_default = 262144
net.core.wmem_default = 262144
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
# Note: net.ipv4.tcp_rmem/wmem will override these for TCP sockets.
########################
# TCP Tuning
########################
# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1
# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state).
# Increase for busy web servers.
# Increased from 2048.
net.ipv4.tcp_max_syn_backlog = 4096
# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy web servers.
# Slightly more aggressive than 30.
net.ipv4.tcp_fin_timeout = 20
# Allow reusing sockets in TIME-WAIT state for new outgoing connections.
# Requires tcp_timestamps=1. Useful if the server makes many outgoing connections
# (e.g., web server to backend DB, or DB to other services).
net.ipv4.tcp_tw_reuse = 1
# Enable TCP Timestamps. Required for tcp_tw_reuse and helps with RTT estimation.
# Usually enabled by default, but good to be explicit.
net.ipv4.tcp_timestamps = 1
# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT and removed in kernels >= 4.12.
# Setting to 0 ensures it's off or does nothing on newer ones.
net.ipv4.tcp_tw_recycle = 0
# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 minutes of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5
# Range of ephemeral ports for outgoing connections. Expands the available pool.
# Max is 65535.
net.ipv4.ip_local_port_range= 1024 65535
# TCP receive and send buffer sizes (min, default, max in bytes).
# These override net.core.rmem/wmem_default/max for TCP.
# Max values (64MB) are still very large for a generic server.
# Moderating max to 16MB or 32MB is usually sufficient.
# The default values here (87KB read, 64KB write) are reasonable starting points.
# Max 16MB for each direction.
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
# TCP congestion control algorithm. BBR often improves throughput and latency,
# especially over connections with some packet loss or varying RTTs.
# Ensure 'bbr' module is available and loaded.
net.ipv4.tcp_congestion_control = bbr
# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data (common for web).
net.ipv4.tcp_slow_start_after_idle = 0
# Enable Path MTU discovery probing. '1' enables after ICMP black hole detection.
net.ipv4.tcp_mtu_probing = 1
# Do not save TCP metrics from closed connections. Can be useful if routes/paths change.
net.ipv4.tcp_no_metrics_save= 1
########################
# Conntrack (Netfilter Connection Tracking)
# Important if the server is behind a firewall, acts as a firewall/NAT,
# or uses local firewall rules extensively.
# Adjust based on actual usage (cat /proc/sys/net/netfilter/nf_conntrack_count)
# Each entry uses ~300 bytes of non-swappable kernel memory.
########################
# Max entries (~150MB RAM at ~300 bytes/entry).
net.netfilter.nf_conntrack_max = 524288
# 2 hours for established TCP.
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
# Must be >= tcp_fin_timeout.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 120
########################
# Security & Misc
########################
# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects= 0
net.ipv4.conf.default.accept_redirects= 0
# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1
# For IPv6, if actively used:
# net.ipv6.conf.all.accept_redirects = 0
# net.ipv6.conf.default.accept_redirects = 0
# net.ipv6.conf.all.accept_source_route = 0
# net.ipv6.conf.default.accept_source_route = 0
# /etc/sysctl.d/60-sysctl.conf
# Optimized Generic Web + DB Server Tuning for CentOS 6 (kernel 2.6.32)
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl -p /etc/sysctl.d/60-sysctl.conf
# or reboot. To apply to current system without reboot: sysctl --system
########################
# Kernel & Memory / I/O Writeback
########################
# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3
# How aggressively the kernel will swap memory pages.
# Lower values tell the kernel to prefer dropping caches over swapping.
# 10 is a good starting point for DBs and general servers.
vm.swappiness = 10
# Tend to keep dentry/inode caches longer, good for frequent file access (DB data files, web static content).
vm.vfs_cache_pressure = 50
# Maximum percentage of total system memory that can hold dirty pages
# before processes are forced to write data synchronously.
# 15% is reasonable with moderately fast storage. For very slow disks, consider 10%.
vm.dirty_ratio = 15
# Percentage of total system memory where background kernel flusher threads
# will start writing dirty data to disk.
vm.dirty_background_ratio = 5
# Minimum amount of free RAM (in KB) the system should maintain.
# 64MB is a reasonable minimum for systems of this era.
vm.min_free_kbytes = 65536
# '1' means the kernel will always approve memory requests, potentially overcommitting.
# Useful for applications like databases that might allocate large buffers upfront.
# Monitor for OOM killer activity if memory is tight.
vm.overcommit_memory = 1
########################
# File Handles & Tasks
########################
# System-wide limit for open file handles.
# Good for web servers (many sockets) and DBs (many data files, connections).
fs.file-max = 2097152
# Maximum number of processes/threads. Useful for web servers with many workers.
# Default is often 32768.
kernel.pid_max = 65536
########################
# Network Core, Buffers & Queues
########################
# Maximum number of connection requests queued for a listening socket (listen() backlog).
# Crucial for web servers handling many incoming connections.
# Increased from original 1024.
net.core.somaxconn = 4096
# Maximum number of packets queued on the input side of a network interface
# when the interface receives packets faster than the kernel can process them.
# Good for 1GbE NICs under high load.
net.core.netdev_max_backlog = 5000
# Default socket receive buffer size (bytes).
net.core.rmem_default = 262144
# Default socket send buffer size (bytes).
net.core.wmem_default = 262144
# Maximum socket receive buffer size (bytes).
# 16MB.
net.core.rmem_max = 16777216
# Maximum socket send buffer size (bytes).
# 16MB.
net.core.wmem_max = 16777216
# Note: net.ipv4.tcp_rmem/wmem will override these for TCP sockets.
########################
# TCP Tuning (Kernel 2.6.32 specific considerations)
########################
# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1
# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state).
# Increase for busy web servers. Should generally be >= somaxconn.
net.ipv4.tcp_max_syn_backlog= 4096
# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy web servers.
# Slightly more conservative than the 15-20 used on newer kernels.
net.ipv4.tcp_fin_timeout = 25
# Allow reusing sockets in TIME-WAIT state for new outgoing connections.
# Requires tcp_timestamps=1. Useful if the server makes many outgoing connections.
net.ipv4.tcp_tw_reuse = 1
# Enable TCP Timestamps. Required for tcp_tw_reuse and helps with RTT estimation.
# Often on by default, but explicit is better.
net.ipv4.tcp_timestamps = 1
# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT.
# Setting to 0 ensures it's off.
net.ipv4.tcp_tw_recycle = 0
# Range of ephemeral ports for outgoing connections. Expands the available pool.
# Max is 65535.
net.ipv4.ip_local_port_range= 1024 65535
# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 minutes of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5
# TCP receive and send buffer sizes (min, default, max in bytes).
# These override net.core.rmem/wmem_default/max for TCP.
# Max values match net.core limits.
net.ipv4.tcp_rmem = 4096 87380 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
# Enable TCP window scaling (RFC 1323). Usually on by default.
net.ipv4.tcp_window_scaling = 1
# Enable Selective Acknowledgements (SACK). Usually on by default.
net.ipv4.tcp_sack = 1
# Enable TCP auto-tuning of receive buffer. Usually on by default.
net.ipv4.tcp_moderate_rcvbuf= 1
# TCP Congestion Control: CentOS 6 (kernel 2.6.32) defaults to 'cubic'.
# 'bbr' is not available. 'cubic' is generally the best choice for this kernel.
# No need to set it if it's already the default, but you could explicitly set:
# net.ipv4.tcp_congestion_control = cubic
# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data (common for web).
net.ipv4.tcp_slow_start_after_idle = 0
########################
# Conntrack (Netfilter Connection Tracking)
# Important if the server has a local firewall (iptables) or handles NAT.
# For CentOS 6, the parameter might be ip_conntrack_max if nf_conntrack module isn't dominant.
# nf_conntrack is generally preferred if available and loaded.
# Check with: lsmod | grep -E "nf_conntrack|ip_conntrack"
# And then check existence of /proc/sys/net/netfilter/nf_conntrack_max or /proc/sys/net/ipv4/ip_conntrack_max
########################
# Assuming nf_conntrack is in use:
# Max entries (~75MB RAM at ~300 bytes/entry).
net.netfilter.nf_conntrack_max = 262144
# 2 hours for established TCP.
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
# Must be >= tcp_fin_timeout.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 120
# If using the older ip_conntrack (less likely for later 2.6.32 but possible):
# net.ipv4.ip_conntrack_max = 65536 # Example, adjust based on memory
# net.ipv4.ip_conntrack_tcp_timeout_established = 7200
########################
# Security & Hardening
########################
# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects= 0
net.ipv4.conf.default.accept_redirects= 0
# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1
########################
# IPv6 (Optional - Disable if not used)
# Disabling IPv6 can slightly reduce kernel overhead and attack surface if unused.
########################
# Uncomment the following lines if you are NOT using IPv6:
# net.ipv6.conf.all.disable_ipv6 = 1
# net.ipv6.conf.default.disable_ipv6 = 1
# net.ipv6.conf.lo.disable_ipv6 = 1
# /etc/sysctl.d/80-k8s-ipvs-optimized.conf
# Optimized Kubernetes Node Tuning for kube-proxy IPVS mode
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl --system
########################
# System & Memory
########################
# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3
# System-wide limit for open file handles. High value is good for many containers.
fs.file-max = 2097152
# Maximum number of processes/threads. Increased for many pods/containers.
kernel.pid_max = 65536
# Consider also kernel.threads-max if pid_max is very aggressively increased.
# Strongly prefer dropping caches over swapping out Pod/application memory.
# Kubernetes workloads should ideally have memory requests/limits set to avoid swapping.
# Some use 1 or 0 for K8s nodes if RAM is plentiful.
vm.swappiness = 10
# Allow memory overcommit. '1' means the kernel will always approve memory requests.
# Crucial for K8s to allow Pods to start, but ensure proper resource requests/limits
# to prevent actual memory exhaustion leading to OOM killer activity.
vm.overcommit_memory = 1
########################
# TCP Performance
########################
# Enable TCP window scaling, selective acknowledgements, and timestamps. Standard for modern TCP.
net.ipv4.tcp_window_scaling = 1
net.ipv4.tcp_sack = 1
net.ipv4.tcp_timestamps = 1 # Needed for tcp_tw_reuse, also helps RTT estimation.
# Enable TCP Fast Open (TFO). '3' means enable for client and server.
# Can reduce latency for repeated connections if supported by client and server.
net.ipv4.tcp_fastopen = 3
# Max SYN backlog. Increased to handle bursts of new connections to services.
net.ipv4.tcp_max_syn_backlog = 8192
# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1
# Number of SYN retries before giving up. Lowering can speed up failure detection.
# Default is often 5 or 6. For internal K8s traffic, 2-3 might be acceptable.
net.ipv4.tcp_syn_retries = 3
# Number of SYN+ACK retries. Similar considerations.
net.ipv4.tcp_synack_retries = 3
# Range of ephemeral ports for outgoing connections. Expands the available pool.
net.ipv4.ip_local_port_range = 1024 65535
# TCP congestion control. BBR is good for throughput and latency.
# Ensure 'bbr' module is available and loaded, and ideally use 'fq' qdisc.
net.ipv4.tcp_congestion_control = bbr
# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data.
net.ipv4.tcp_slow_start_after_idle = 0
# Enable Path MTU discovery probing. '1' enables after ICMP black hole detection.
net.ipv4.tcp_mtu_probing = 1
# Do not save TCP metrics from closed connections. Can be useful if routes/paths change.
net.ipv4.tcp_no_metrics_save = 1
# Kernel attempts to aggregate small writes before sending (Nagle's algorithm related).
# Default is usually 1.
net.ipv4.tcp_autocorking = 1
# Socket send buffer low watermark (SO_SNDLOWAT). 16KB.
# Helps ensure NIC has data to send, potentially improving throughput.
net.ipv4.tcp_notsent_lowat = 16384
########################
# Network Buffers & Qdisc
########################
# Default queuing discipline. fq_codel is excellent for fairness and reducing bufferbloat.
# If using BBR, 'fq' is its canonical partner, but fq_codel is also a very strong choice.
# ('fq' is the alternative if strictly following BBR's recommendation.)
net.core.default_qdisc = fq_codel
# Maximum number of packets queued on the input side of a network interface.
# Very high value, good for high-speed NICs (10GbE+) to prevent drops.
net.core.netdev_max_backlog = 30000
# Maximum number of connection requests queued for a listening socket (listen() backlog)
# Increased from original 1024 for better handling of connection bursts.
net.core.somaxconn = 4096
# Default and maximum TCP receive/send buffer sizes.
# The original values (30MB/64MB) are very large and might consume excessive memory
# per socket. Moderating these while still being generous is often better.
# Linux auto-tuning (net.ipv4.tcp_moderate_rcvbuf=1 by default) usually works well.
# These are still generous values:
# Defaults: 1MB each; maximums: 16MB each.
net.core.rmem_default = 1048576
net.core.wmem_default = 1048576
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
########################
# Fragmentation & Time-Wait
########################
# Memory limits for IP fragment reassembly (bytes). Increased for robustness.
# High threshold: 4MB; low threshold: 3MB.
net.ipv4.ipfrag_high_thresh = 4194304
net.ipv4.ipfrag_low_thresh = 3145728
# Timeout for reassembling fragments (seconds).
net.ipv4.ipfrag_time = 30
# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy servers.
net.ipv4.tcp_fin_timeout = 15
# tcp_tw_recycle: DO NOT ENABLE. Problematic with NAT, removed in kernels >= 4.12.
# Setting to 0 ensures it's off or does nothing on newer kernels.
net.ipv4.tcp_tw_recycle = 0
# tcp_tw_reuse: Allow reusing sockets in TIME-WAIT state for new OUTGOING connections.
# Requires tcp_timestamps=1. Generally safe and beneficial for nodes making many
# outgoing connections (e.g., to backend Pods). Changed from original 0.
net.ipv4.tcp_tw_reuse = 1
# Maximum number of sockets in TIME-WAIT state.
# High value for K8s due to high connection churn. Monitor actual usage.
# Default is often much lower.
net.ipv4.tcp_max_tw_buckets = 1440000
# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 minutes of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5
########################
# Routing & Bridge/NF (Essential for K8s)
########################
# Enable IP forwarding. Critical for K8s nodes.
net.ipv4.ip_forward = 1
# Pass bridged IPv4/IPv6 traffic through netfilter (iptables/nftables) hooks.
# CRITICAL for K8s CNI plugins and network policies to function correctly,
# even when IPVS is used for service load balancing.
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
# For IPv6 if used in the cluster:
# net.ipv6.conf.all.forwarding = 1
# net.ipv6.conf.default.forwarding = 1
########################
# IPVS (kube-proxy specific settings)
########################
# Enable IPVS connection tracking integration with netfilter conntrack.
# This is often required for network policies (Calico, Cilium, etc.) to correctly
# see and filter IPVS-handled traffic.
net.ipv4.vs.conntrack = 1
# Expire connections to a destination server when it's removed from the service.
# Helps in faster convergence when backend Pods are deleted.
net.ipv4.vs.expire_nodest_conn = 1
# Expire persistent templates when a destination server is quiesced (weight 0).
# Useful for graceful backend removal.
net.ipv4.vs.expire_quiescent_template = 1
# Be more lenient with TCP state transitions for IPVS. Can help with some clients/NAT.
net.ipv4.vs.sloppy_tcp = 1
# Be more lenient with UDP "connections" for IPVS.
net.ipv4.vs.sloppy_udp = 1
########################
# Conntrack (Netfilter Connection Tracking)
# Values depend heavily on cluster size, traffic patterns, and available RAM.
# Each conntrack entry uses ~300 bytes of non-swappable kernel memory.
########################
# Maximum number of connection tracking entries.
# 1,048,576 * ~300 bytes = ~300MB RAM. Adjust based on monitoring.
# Monitor with: cat /proc/sys/net/netfilter/nf_conntrack_count
net.netfilter.nf_conntrack_max = 1048576
# Timeout for established TCP connections (seconds). Default is 5 days (432000).
# 12 hours (43200) can help recycle entries faster but might drop long-lived idle connections.
net.netfilter.nf_conntrack_tcp_timeout_established = 43200
# Shorter timeouts for TCP connections in closing states.
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
# TIME_WAIT timeout in conntrack should generally align with net.ipv4.tcp_fin_timeout
# or be slightly longer. Default is 120s.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
# Timeout for generic (non-TCP/UDP/ICMP) protocol entries.
net.netfilter.nf_conntrack_generic_timeout = 60
########################
# Security Hardening
########################
# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
# Set to 2 (loose) if asymmetric routing is used on the node itself.
net.ipv4.conf.default.rp_filter = 1
# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1
# Ignore ICMP echo requests to broadcast/multicast addresses
net.ipv4.icmp_echo_ignore_broadcasts = 1
# Ignore all ICMP timestamp requests (minor security hardening)
# NOTE(review): 'net.ipv4.icmp_timestamp_ignore_all' does not appear to be a real
# Linux sysctl key (only icmp_echo_ignore_all / icmp_echo_ignore_broadcasts /
# icmp_ignore_bogus_error_responses exist); applying it produces an error.
# Block ICMP timestamp requests (type 13) with a firewall rule instead.
# net.ipv4.icmp_timestamp_ignore_all = 1
########################
# ARP Cache (Potentially useful for K8s with many pods/services on same L2)
########################
# Adjust ARP cache garbage collection thresholds if you have a very large number of
# active IP addresses on the same L2 network as the node.
# Threshold1: Soft limit, GC starts here.
# Threshold2: Hard limit, GC becomes more aggressive.
# Threshold3: Absolute max, entries might be dropped.
# Defaults are often 128, 512, 1024. Increase if ARP cache overflows are suspected.
# net.ipv4.neigh.default.gc_thresh1 = 1024
# net.ipv4.neigh.default.gc_thresh2 = 2048
# net.ipv4.neigh.default.gc_thresh3 = 4096
# /etc/sysctl.d/80-k8s.conf
# Optimized Kubernetes Node Network Tuning for kube-proxy native nftables mode
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl --system
########################
# Kernel & Memory
########################
# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3
# Strongly prefer dropping caches over swapping out Pod/application memory.
# Kubernetes workloads should ideally have memory requests/limits set to avoid swapping.
# Some even use 1 or 0 for K8s nodes if RAM is plentiful.
vm.swappiness = 10
# Allow memory overcommit. '1' means the kernel will always approve memory requests.
# This can be useful for applications that request more memory than they immediately use.
# However, ensure your K8s resource requests/limits are well-defined to prevent
# actual memory exhaustion leading to OOM killer activity.
vm.overcommit_memory = 1
# System-wide limit for open file handles. High value is good for many containers.
fs.file-max = 2097152
# Maximum number of processes/threads. Increased for many pods/containers.
kernel.pid_max = 65536
# Consider also kernel.threads-max if pid_max is very aggressively increased,
# though pid_max usually governs overall process/thread count.
########################
# TCP Performance
########################
# Enable TCP window scaling, selective acknowledgements, and timestamps.
# These are standard for modern TCP and generally beneficial.
net.ipv4.tcp_window_scaling = 1
net.ipv4.tcp_sack = 1
net.ipv4.tcp_timestamps = 1 # Needed for tcp_tw_reuse, also helps RTT estimation.
# Enable TCP Fast Open (TFO). '3' means enable for client and server.
# Can reduce latency for repeated connections if supported by client and server.
net.ipv4.tcp_fastopen = 3
# Max SYN backlog. Increased to handle bursts of new connections to services.
net.ipv4.tcp_max_syn_backlog= 8192
# Enable TCP SYN cookies to help protect against SYN flood attacks.
net.ipv4.tcp_syncookies = 1
# Number of SYN retries before giving up. Lowering can speed up failure detection
# for unresponsive peers but might be too aggressive for lossy external networks.
# Default is often 5 or 6. For internal K8s traffic, 2 might be acceptable.
net.ipv4.tcp_syn_retries = 2
# Number of SYN+ACK retries. Similar considerations as tcp_syn_retries.
net.ipv4.tcp_synack_retries = 2
# Range of ephemeral ports for outgoing connections. Expands the available pool.
net.ipv4.ip_local_port_range= 1024 65535
# TCP congestion control. BBR is good for throughput and latency.
# Ensure 'bbr' module is available and loaded, and ideally use 'fq' qdisc.
net.ipv4.tcp_congestion_control = bbr
# Disable restarting TCP slow start after an idle period.
# Can improve performance for connections that are idle then burst data.
net.ipv4.tcp_slow_start_after_idle = 0
# Enable Path MTU discovery probing.
# '0' = disabled, '1' = enabled only after ICMP black hole detection, '2' = always enabled.
net.ipv4.tcp_mtu_probing = 1
# Do not save TCP metrics from closed connections. Can be useful if routes/paths change.
net.ipv4.tcp_no_metrics_save= 1
# Kernel attempts to aggregate small writes before sending (Nagle's algorithm related).
# Default is usually 1.
net.ipv4.tcp_autocorking = 1
# Socket send buffer low watermark (SO_SNDLOWAT). 16KB.
# This tells the kernel to try to ensure there's at least this much unsent data
# available for the NIC to DMA, potentially improving throughput for some drivers/NICs.
net.ipv4.tcp_notsent_lowat = 16384
########################
# Network Buffers & Qdisc
########################
# Maximum number of packets queued on the input side of a network interface.
# Very high value, good for high-speed NICs (10GbE+) to prevent drops.
net.core.netdev_max_backlog= 30000
# Default and maximum TCP receive/send buffer sizes.
# The original values (30MB/64MB) are very large and might consume excessive memory
# per socket, especially if many sockets don't need such large buffers.
# Moderating these while still being generous is often better.
# Linux auto-tuning (net.ipv4.tcp_moderate_rcvbuf=1 by default) usually works well.
# Setting very large fixed defaults can sometimes be counterproductive unless you have
# specific high BDP (Bandwidth-Delay Product) paths that require them.
# These are still generous values:
# Defaults: 1MB each; maximums: 16MB each.
net.core.rmem_default = 1048576
net.core.wmem_default = 1048576
net.core.rmem_max = 16777216
net.core.wmem_max = 16777216
# Default queuing discipline. fq_codel is excellent for fairness and reducing bufferbloat.
# If using BBR, 'fq' is its canonical partner, but fq_codel is also a very strong choice.
net.core.default_qdisc = fq_codel
########################
# IP Fragmentation
########################
# Memory limits for IP fragment reassembly.
# These allow buffering more fragments if heavy fragmentation occurs.
# K8s networking (CNIs) generally tries to avoid fragmentation.
# High threshold increased from 262144 to 4MB; low threshold from 196608 to 3MB.
net.ipv4.ipfrag_high_thresh = 4194304
net.ipv4.ipfrag_low_thresh = 3145728
# Timeout for reassembling fragments (seconds).
net.ipv4.ipfrag_time = 30
########################
# TIME-WAIT & Keepalive
########################
# tcp_tw_recycle: DO NOT ENABLE. Problematic with NAT, removed in kernels >= 4.12.
# Setting to 0 ensures it's off or does nothing on newer kernels.
net.ipv4.tcp_tw_recycle = 0
# tcp_tw_reuse: Allow reusing sockets in TIME-WAIT state for new OUTGOING connections.
# Requires tcp_timestamps=1. Generally safe and beneficial for nodes making many
# outgoing connections (e.g., to other services, external APIs).
# Original had 0, changing to 1 is generally recommended for busy servers.
net.ipv4.tcp_tw_reuse = 1
# Maximum number of sockets in TIME-WAIT state.
# High value for K8s due to high connection churn. Monitor actual usage.
# Default is often much lower.
net.ipv4.tcp_max_tw_buckets = 1440000
# TCP Keepalive settings: Detect dead connections.
# Send first keepalive after 10 minutes of idleness.
net.ipv4.tcp_keepalive_time = 600
# Send subsequent probes every 60 seconds.
net.ipv4.tcp_keepalive_intvl = 60
# Declare connection dead after 5 failed probes.
net.ipv4.tcp_keepalive_probes = 5
########################
# Security & ICMP
########################
# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route= 0
# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
# Set to 2 (loose) if asymmetric routing is used on the node itself.
net.ipv4.conf.default.rp_filter = 1
# Log packets with impossible source addresses (martians)
net.ipv4.conf.all.log_martians = 1
# Ignore ICMP echo requests to broadcast/multicast addresses
net.ipv4.icmp_echo_ignore_broadcasts = 1
# Ignore all ICMP timestamp requests (minor security hardening)
# NOTE(review): 'net.ipv4.icmp_timestamp_ignore_all' does not appear to be a real
# Linux sysctl key (only icmp_echo_ignore_all / icmp_echo_ignore_broadcasts /
# icmp_ignore_bogus_error_responses exist); applying it produces an error.
# Block ICMP timestamp requests (type 13) with a firewall rule instead.
# net.ipv4.icmp_timestamp_ignore_all = 1
########################
# Routing & Bridge/nft (Essential for K8s)
########################
# Enable IP forwarding. Critical for K8s nodes.
net.ipv4.ip_forward = 1
# Pass bridged IPv4/IPv6 traffic through netfilter (iptables/nftables) hooks.
# CRITICAL for K8s CNI plugins and kube-proxy (even in nftables mode for its own rules)
# to apply network policies, NAT, service routing, etc.
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
# nftables uses the same L2 hooks as iptables for bridged traffic.
# For IPv6 if used in the cluster:
# net.ipv6.conf.all.forwarding = 1
# net.ipv6.conf.default.forwarding = 1
########################
# Conntrack (nftables mode relies on nf_conntrack)
# Values depend heavily on cluster size, traffic patterns, and available RAM.
# Each conntrack entry uses ~300 bytes of non-swappable kernel memory.
########################
# Maximum number of connection tracking entries.
# 1,048,576 * ~300 bytes = ~300MB RAM. Adjust based on monitoring.
net.netfilter.nf_conntrack_max = 1048576
# Monitor with: cat /proc/sys/net/netfilter/nf_conntrack_count
# Timeout for established TCP connections (seconds). Default is 5 days (432000).
# 12 hours (43200) can help recycle entries faster but might drop long-lived idle connections.
net.netfilter.nf_conntrack_tcp_timeout_established = 43200
# Shorter timeouts for TCP connections in closing states.
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
# Should align with the TCP TIME_WAIT interval.
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 120
# Timeout for generic (non-TCP/UDP/ICMP) protocol entries.
net.netfilter.nf_conntrack_generic_timeout = 60
########################
# ARP Cache (Potentially useful for K8s with many pods/services)
########################
# Adjust ARP cache garbage collection thresholds if you have a very large number of
# active IP addresses on the same L2 network as the node.
# Threshold1: Soft limit, GC starts here.
# Threshold2: Hard limit, GC becomes more aggressive.
# Threshold3: Absolute max, entries might be dropped.
# Defaults are often 128, 512, 1024. Increase if ARP cache overflows are suspected.
# net.ipv4.neigh.default.gc_thresh1 = 1024
# net.ipv4.neigh.default.gc_thresh2 = 2048
# net.ipv4.neigh.default.gc_thresh3 = 4096
# /etc/sysctl.d/80-pve.conf
# Optimized Proxmox VE Host Tuning
# Based on original by Kawin Viriyaprasopsook <[email protected]>
# Apply with: sysctl --system
########################
# Memory & VM Caching
########################
# Reduce console noise from kernel messages
kernel.printk = 3 4 1 3
# Strongly prefer dropping caches over swapping out application (VM) memory
vm.swappiness = 10
# Tend to keep dentry/inode caches longer, good for frequent file access (VM disks)
vm.vfs_cache_pressure = 50
# Max % of total memory for dirty pages before forcing synchronous writes
vm.dirty_ratio = 10
# % of total memory for dirty pages before background kernel flusher threads start writing
vm.dirty_background_ratio = 5
# For systems with very fast storage and lots of RAM, consider using
# vm.dirty_bytes and vm.dirty_background_bytes for more absolute control.
# Minimum amount of free RAM (in KB) the system should maintain (128MB)
# Increase if you have very large amounts of RAM (e.g., 256GB+)
vm.min_free_kbytes = 131072
# Maximum number of memory map areas a process can have
# Increased for some containerized workloads (e.g., Elasticsearch) or complex applications
vm.max_map_count = 262144
########################
# File Handles & PIDs
########################
# System-wide limit for open file handles
fs.file-max = 2097152
# Maximum number of processes/threads the system can have
# Increased from default (often 32768) for busy virtualization hosts
kernel.pid_max = 65536
# Consider kernel.threads-max as well if pid_max is significantly increased,
# though pid_max often covers thread limits too.
########################
# Networking (Bridges & Forwarding)
########################
# Enable IP forwarding for guest routing/NAT
net.ipv4.ip_forward = 1
# Pass bridged traffic through the host's iptables/ip6tables/arptables chains.
# REQUIRED if using Proxmox VE firewall for VMs on bridges.
# Set to 0 if NOT using PVE firewall for bridged VMs and security is handled elsewhere,
# for a slight performance gain by bypassing host netfilter for bridged packets.
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-arptables = 1
########################
# TCP & Socket Queues
########################
# Maximum number of connection requests queued for a listening socket (listen() backlog)
net.core.somaxconn = 4096
# Maximum number of packets queued on the input side of a network interface
# when the interface receives packets faster than the kernel can process them.
# Good for 10GbE+ or busy 1GbE. Monitor for rx_dropped.
net.core.netdev_max_backlog = 5000
# Default and maximum TCP receive buffer size (bytes)
net.core.rmem_default = 262144
net.core.rmem_max = 16777216
# Default and maximum TCP send buffer size (bytes)
net.core.wmem_default = 262144
net.core.wmem_max = 16777216
# Maximum number of remembered connection requests not yet acknowledged (SYN_RECV state)
net.ipv4.tcp_max_syn_backlog = 2048
# Enable TCP SYN cookies to help protect against SYN flood attacks
net.ipv4.tcp_syncookies = 1
# Time to hold a socket in FIN-WAIT-2 state. Default is 60.
# Lowering helps free up socket resources faster on busy servers.
net.ipv4.tcp_fin_timeout = 15
# Allow reusing sockets in TIME-WAIT state for new outgoing connections. Generally safe.
net.ipv4.tcp_tw_reuse = 1
# DO NOT ENABLE tcp_tw_recycle. It's problematic with NAT and was removed
# in kernels >= 4.12, where even "= 0" makes sysctl log a missing-key error.
# Left commented out; uncomment only on kernels < 4.12.
# net.ipv4.tcp_tw_recycle = 0
# Range of ephemeral ports for outgoing connections. Expands the available pool.
net.ipv4.ip_local_port_range = 1024 65535
# TCP congestion control algorithm. BBR often improves throughput and latency.
# Ensure 'bbr' module is available and loaded. 'cubic' is the older default.
# Check availability: sysctl net.ipv4.tcp_available_congestion_control
net.ipv4.tcp_congestion_control = bbr
########################
# Conntrack (Connection Tracking - for Netfilter/iptables)
# Adjust based on actual usage (cat /proc/sys/net/netfilter/nf_conntrack_count)
# Each entry uses ~300 bytes of non-swappable kernel memory.
########################
# Maximum number of connection tracking entries.
net.netfilter.nf_conntrack_max = 262144
# How long to keep an established TCP connection in the tracking table (seconds, 1 day)
net.netfilter.nf_conntrack_tcp_timeout_established = 86400
# Timeout for generic (non-TCP/UDP/ICMP) protocol entries (seconds, 5 minutes)
net.netfilter.nf_conntrack_generic_timeout = 300
########################
# Security Hardening
########################
# Enable strict reverse path filtering to prevent IP spoofing
net.ipv4.conf.all.rp_filter = 1
net.ipv4.conf.default.rp_filter = 1
# Disable acceptance of ICMP redirect messages (potential MITM vector)
net.ipv4.conf.all.accept_redirects = 0
net.ipv4.conf.default.accept_redirects = 0
# Disable acceptance of source routed packets (security risk)
net.ipv4.conf.all.accept_source_route = 0
net.ipv4.conf.default.accept_source_route = 0
# Log packets with impossible addresses (martians)
net.ipv4.conf.all.log_martians = 1
net.ipv4.conf.default.log_martians = 1
# For IPv6, similar hardening can be applied if IPv6 is actively used:
# net.ipv6.conf.all.accept_redirects = 0
# net.ipv6.conf.default.accept_redirects = 0
# net.ipv6.conf.all.accept_source_route = 0
# net.ipv6.conf.default.accept_source_route = 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment