summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.txt22
-rw-r--r--include/linux/tcp.h6
-rw-r--r--include/net/tcp.h33
-rw-r--r--include/uapi/linux/snmp.h6
-rw-r--r--net/ipv4/proc.c6
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp_input.c30
-rw-r--r--net/ipv4/tcp_minisocks.c36
8 files changed, 133 insertions, 13 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a5e4c813f17f..1b8c964b0d17 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -290,6 +290,28 @@ tcp_frto - INTEGER
By default it's enabled with a non-zero value. 0 disables F-RTO.
+tcp_invalid_ratelimit - INTEGER
+ Limit the maximal rate for sending duplicate acknowledgments
+ in response to incoming TCP packets that are for an existing
+ connection but that are invalid due to any of these reasons:
+
+ (a) out-of-window sequence number,
+ (b) out-of-window acknowledgment number, or
+ (c) PAWS (Protection Against Wrapped Sequence numbers) check failure
+
+ This can help mitigate simple "ack loop" DoS attacks, wherein
+ a buggy or malicious middlebox or man-in-the-middle can
+ rewrite TCP header fields in manner that causes each endpoint
+ to think that the other is sending invalid TCP segments, thus
+ causing each side to send an unterminating stream of duplicate
+ acknowledgments for invalid segments.
+
+ Using 0 disables rate-limiting of dupacks in response to
+ invalid segments; otherwise this value specifies the minimal
+ space between sending such dupacks, in milliseconds.
+
+ Default: 500 (milliseconds).
+
tcp_keepalive_time - INTEGER
How often TCP sends out keepalive messages when keepalive is enabled.
Default: 2hours.
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 67309ece0772..1a7adb411647 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -115,6 +115,7 @@ struct tcp_request_sock {
u32 rcv_isn;
u32 snt_isn;
u32 snt_synack; /* synack sent time */
+ u32 last_oow_ack_time; /* last SYNACK */
u32 rcv_nxt; /* the ack # by SYNACK. For
* FastOpen it's the seq#
* after data-in-SYN.
@@ -152,6 +153,7 @@ struct tcp_sock {
u32 snd_sml; /* Last byte of the most recently transmitted small packet */
u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */
u32 lsndtime; /* timestamp of last sent data packet (for restart window) */
+ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */
u32 tsoffset; /* timestamp offset */
@@ -340,6 +342,10 @@ struct tcp_timewait_sock {
u32 tw_rcv_wnd;
u32 tw_ts_offset;
u32 tw_ts_recent;
+
+ /* The time we sent the last out-of-window ACK: */
+ u32 tw_last_oow_ack_time;
+
long tw_ts_recent_stamp;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *tw_md5_key;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 28e9bd3abceb..da4196fb78db 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -274,6 +274,7 @@ extern int sysctl_tcp_challenge_ack_limit;
extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_autocorking;
+extern int sysctl_tcp_invalid_ratelimit;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -1144,6 +1145,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
tcp_rsk(req)->snt_synack = tcp_time_stamp;
+ tcp_rsk(req)->last_oow_ack_time = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
@@ -1236,6 +1238,37 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt,
return true;
}
+/* Return true if we're currently rate-limiting out-of-window ACKs and
+ * thus shouldn't send a dupack right now. We rate-limit dupacks in
+ * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
+ * attacks that send repeated SYNs or ACKs for the same connection. To
+ * do this, we do not send a duplicate SYNACK or ACK if the remote
+ * endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
+ */
+static inline bool tcp_oow_rate_limited(struct net *net,
+ const struct sk_buff *skb,
+ int mib_idx, u32 *last_oow_ack_time)
+{
+ /* Data packets without SYNs are not likely part of an ACK loop. */
+ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
+ !tcp_hdr(skb)->syn)
+ goto not_rate_limited;
+
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS_BH(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+not_rate_limited:
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
static inline void tcp_mib_init(struct net *net)
{
/* See RFC 2012 */
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index b22224100011..6a6fb747c78d 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -270,6 +270,12 @@ enum
LINUX_MIB_TCPHYSTARTTRAINCWND, /* TCPHystartTrainCwnd */
LINUX_MIB_TCPHYSTARTDELAYDETECT, /* TCPHystartDelayDetect */
LINUX_MIB_TCPHYSTARTDELAYCWND, /* TCPHystartDelayCwnd */
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV, /* TCPACKSkippedSynRecv */
+ LINUX_MIB_TCPACKSKIPPEDPAWS, /* TCPACKSkippedPAWS */
+ LINUX_MIB_TCPACKSKIPPEDSEQ, /* TCPACKSkippedSeq */
+ LINUX_MIB_TCPACKSKIPPEDFINWAIT2, /* TCPACKSkippedFinWait2 */
+ LINUX_MIB_TCPACKSKIPPEDTIMEWAIT, /* TCPACKSkippedTimeWait */
+ LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */
__LINUX_MIB_MAX
};
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 8f9cd200ce20..d8953ef0770c 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -292,6 +292,12 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND),
SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT),
SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND),
+ SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV),
+ SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS),
+ SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ),
+ SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2),
+ SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT),
+ SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index e0ee384a448f..82601a68cf90 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -729,6 +729,13 @@ static struct ctl_table ipv4_table[] = {
.extra2 = &one,
},
{
+ .procname = "tcp_invalid_ratelimit",
+ .data = &sysctl_tcp_invalid_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
.procname = "icmp_msgs_per_sec",
.data = &sysctl_icmp_msgs_per_sec,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d3dfff78fa19..8fdd27b17306 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
+int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -3321,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
}
/* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
{
/* unprotected vars, we dont care of overwrites */
static u32 challenge_timestamp;
static unsigned int challenge_count;
- u32 now = jiffies / HZ;
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 now;
+
+ /* First check our per-socket dupack rate limit. */
+ if (tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+ &tp->last_oow_ack_time))
+ return;
+ /* Then check the check host-wide RFC 5961 rate limit. */
+ now = jiffies / HZ;
if (now != challenge_timestamp) {
challenge_timestamp = now;
challenge_count = 0;
@@ -3423,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (before(ack, prior_snd_una)) {
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - tp->max_window)) {
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, skb);
return -1;
}
goto old_ack;
@@ -4992,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
- tcp_send_dupack(sk, skb);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDPAWS,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
goto discard;
}
/* Reset is accepted even if it did not pass PAWS. */
@@ -5009,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (!th->rst) {
if (th->syn)
goto syn_challenge;
- tcp_send_dupack(sk, skb);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSEQ,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
}
goto discard;
}
@@ -5025,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt)
tcp_reset(sk);
else
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, skb);
goto discard;
}
@@ -5039,7 +5055,7 @@ syn_challenge:
if (syn_inerr)
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, skb);
goto discard;
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bc9216dc9de1..dd11ac7798c6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -58,6 +58,25 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
return seq == e_win && seq == end_seq;
}
+static enum tcp_tw_status
+tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
+ const struct sk_buff *skb, int mib_idx)
+{
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+
+ if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx,
+ &tcptw->tw_last_oow_ack_time)) {
+ /* Send ACK. Note, we do not put the bucket,
+ * it will be released by caller.
+ */
+ return TCP_TW_ACK;
+ }
+
+ /* We are rate-limiting, so just release the tw sock and drop skb. */
+ inet_twsk_put(tw);
+ return TCP_TW_SUCCESS;
+}
+
/*
* * Main purpose of TIME-WAIT state is to close connection gracefully,
* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -116,7 +135,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tcptw->tw_rcv_nxt,
tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
- return TCP_TW_ACK;
+ return tcp_timewait_check_oow_rate_limit(
+ tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
if (th->rst)
goto kill;
@@ -250,10 +270,8 @@ kill:
inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
TCP_TIMEWAIT_LEN);
- /* Send ACK. Note, we do not put the bucket,
- * it will be released by caller.
- */
- return TCP_TW_ACK;
+ return tcp_timewait_check_oow_rate_limit(
+ tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
}
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
@@ -289,6 +307,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
tcptw->tw_ts_offset = tp->tsoffset;
+ tcptw->tw_last_oow_ack_time = 0;
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
@@ -467,6 +486,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_enable_early_retrans(newtp);
newtp->tlp_high_seq = 0;
newtp->lsndtime = treq->snt_synack;
+ newtp->last_oow_ack_time = 0;
newtp->total_retrans = req->num_retrans;
/* So many TCP implementations out there (incorrectly) count the
@@ -605,7 +625,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* Reset timer after retransmitting SYNACK, similar to
* the idea of fast retransmit in recovery.
*/
- if (!inet_rtx_syn_ack(sk, req))
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+ &tcp_rsk(req)->last_oow_ack_time) &&
+
+ !inet_rtx_syn_ack(sk, req))
req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
TCP_RTO_MAX) + jiffies;
return NULL;