diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f28408c..f9db46e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -95,6 +95,9 @@ struct tcp_options_received { cookie_in_always:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ +#ifdef CONFIG_TCP_ESTATS + u16 rec_mss; /* MSS option received */ +#endif u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; @@ -135,6 +138,10 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } +#ifdef CONFIG_TCP_ESTATS +struct tcp_estats; +#endif + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -326,6 +333,10 @@ struct tcp_sock { */ struct tcp_cookie_values *cookie_values; +#ifdef CONFIG_TCP_ESTATS + struct tcp_estats *tcp_stats; +#endif + /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big diff --git a/include/net/tcp.h b/include/net/tcp.h index a345480..4b2b852 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include diff --git a/include/net/tcp_estats.h b/include/net/tcp_estats.h new file mode 100644 index 0000000..2c73a88 --- /dev/null +++ b/include/net/tcp_estats.h @@ -0,0 +1,361 @@ +/* + * include/net/tcp_estats.h + * + * Implementation of TCP ESTATS MIB (RFC 4898) + * + * Authors: + * John Estabrook + * Andrew K. Adams + * John Heffner + * Matt Mathis + * Jeff Semke + * + * The Web10Gig project. See http://www.web10gig.org + * + * Copyright © 2011, Pittsburgh Supercomputing Center (PSC) and + * National Center for Supercomputing Applications (NCSA). + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef _TCP_ESTATS_H +#define _TCP_ESTATS_H + +#include +#include +#include +#include +#include +#include + +enum tcp_estats_sndlim_states { + TCP_ESTATS_SNDLIM_NONE = -1, + TCP_ESTATS_SNDLIM_SENDER, + TCP_ESTATS_SNDLIM_CWND, + TCP_ESTATS_SNDLIM_RWIN, + TCP_ESTATS_SNDLIM_STARTUP, + TCP_ESTATS_SNDLIM_TSODEFER, + TCP_ESTATS_SNDLIM_NSTATES /* Keep at end */ +}; + +enum tcp_estats_addrtype { + TCP_ESTATS_ADDRTYPE_IPV4 = 1, + TCP_ESTATS_ADDRTYPE_IPV6 = 2 +}; + +enum tcp_estats_softerror_reason { + TCP_ESTATS_SOFTERROR_BELOW_DATA_WINDOW = 1, + TCP_ESTATS_SOFTERROR_ABOVE_DATA_WINDOW = 2, + TCP_ESTATS_SOFTERROR_BELOW_ACK_WINDOW = 3, + TCP_ESTATS_SOFTERROR_ABOVE_ACK_WINDOW = 4, + TCP_ESTATS_SOFTERROR_BELOW_TS_WINDOW = 5, + TCP_ESTATS_SOFTERROR_ABOVE_TS_WINDOW = 6, + TCP_ESTATS_SOFTERROR_DATA_CHECKSUM = 7, + TCP_ESTATS_SOFTERROR_OTHER = 8, +}; + +#define TCP_ESTATS_INACTIVE 0 +#define TCP_ESTATS_ACTIVE 1 + +#define TCP_ESTATS_TABLEMASK_INACTIVE 0x00 +#define TCP_ESTATS_TABLEMASK_ACTIVE 0x01 +#define TCP_ESTATS_TABLEMASK_PERF 0x02 +#define TCP_ESTATS_TABLEMASK_PATH 0x04 +#define TCP_ESTATS_TABLEMASK_STACK 0x08 +#define TCP_ESTATS_TABLEMASK_APP 0x10 +#define TCP_ESTATS_TABLEMASK_TUNE 0x20 +#define TCP_ESTATS_TABLEMASK_EXTRAS 0x40 + +#ifdef CONFIG_TCP_ESTATS + +extern struct static_key tcp_estats_enabled; +#define TCP_ESTATS_CHECK(tp, table, expr) \ + do { \ + if (static_key_false(&tcp_estats_enabled)) { \ + if (likely((tp)->tcp_stats) && \ + likely((tp)->tcp_stats->tables.table)) { \ + (expr); \ + } \ + } \ + } while (0) + +#define TCP_ESTATS_VAR_INC(tp, table, var) \ + TCP_ESTATS_CHECK(tp, table, ++((tp)->tcp_stats->tables.table->var)) +#define TCP_ESTATS_VAR_DEC(tp, table, var) \ + TCP_ESTATS_CHECK(tp, table, --((tp)->tcp_stats->tables.table->var)) +#define TCP_ESTATS_VAR_ADD(tp, table, var, val) \ + TCP_ESTATS_CHECK(tp, table, \ + ((tp)->tcp_stats->tables.table->var) += (val)) +#define TCP_ESTATS_VAR_SUB(tp, table, var, val) \ + TCP_ESTATS_CHECK(tp, table, \ + ((tp)->tcp_stats->tables.table->var) -= (val)) +#define TCP_ESTATS_VAR_SET(tp, table, var, val) \ + TCP_ESTATS_CHECK(tp, table, \ + ((tp)->tcp_stats->tables.table->var) = (val)) +#define TCP_ESTATS_UPDATE(tp, func) \ + do { \ + if (static_key_false(&tcp_estats_enabled)) { \ + if (likely((tp)->tcp_stats)) { \ + (func); \ + } \ + } \ + } while (0) + +/* + * Variables that can be read and written directly. + * + * Contains all variables from RFC 4898. Commented fields have + * external handlers and do not need struct storage. 
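+ * These fields are updated from the TCP code through the TCP_ESTATS_VAR_* and + * TCP_ESTATS_UPDATE() macros above (for example, tcp_slow_start() in tcp_cong.c + * does TCP_ESTATS_VAR_INC(tp, stack_table, SlowStart)); when statistics are + * disabled the macros reduce to a patched-out static-key branch.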
+ */ +struct tcp_estats_connection_table { + /* Connection table */ + u32 AddressType; + struct { u8 data[16]; } LocalAddress; + struct { u8 data[16]; } RemAddress; + u16 LocalPort; + u16 RemPort; +}; + +struct tcp_estats_perf_table { + u32 SegsOut; + u32 DataSegsOut; + u64 DataOctetsOut; + u32 SegsRetrans; + u32 OctetsRetrans; + u32 SegsIn; + u32 DataSegsIn; + u64 DataOctetsIn; + /* ElapsedSecs */ + /* ElapsedMicroSecs */ + /* StartTimeStamp */ + /* CurMSS */ + /* PipeSize */ + u32 MaxPipeSize; + /* SmoothedRTT */ + /* CurRTO */ + u32 CongSignals; + /* CurCwnd */ + /* CurSsthresh */ + u32 Timeouts; + /* CurRwinSent */ + u32 MaxRwinSent; + u32 ZeroRwinSent; + /* CurRwinRcvd */ + u32 MaxRwinRcvd; + u32 ZeroRwinRcvd; + /* SndLimTransRwin */ + /* SndLimTransCwnd */ + /* SndLimTransSnd */ + /* SndLimTimeRwin */ + /* SndLimTimeCwnd */ + /* SndLimTimeSnd */ + u32 snd_lim_trans[TCP_ESTATS_SNDLIM_NSTATES]; + u32 snd_lim_time[TCP_ESTATS_SNDLIM_NSTATES]; +}; + +struct tcp_estats_path_table { + /* RetranThresh */ + u32 NonRecovDAEpisodes; + u32 SumOctetsReordered; + u32 NonRecovDA; + u32 SampleRTT; + /* RTTVar */ + u32 MaxRTT; + u32 MinRTT; + u64 SumRTT; + u32 CountRTT; + u32 MaxRTO; + u32 MinRTO; + u8 IpTtl; + u8 IpTosIn; + /* IpTosOut */ + u32 PreCongSumCwnd; + u32 PreCongSumRTT; + u32 PostCongSumRTT; + u32 PostCongCountRTT; + u32 ECNsignals; + u32 DupAckEpisodes; + /* RcvRTT */ + u32 DupAcksOut; + u32 CERcvd; + u32 ECESent; +}; + +struct tcp_estats_stack_table { + u32 ActiveOpen; + /* MSSSent */ + /* MSSRcvd */ + /* WinScaleSent */ + /* WinScaleRcvd */ + /* TimeStamps */ + /* ECN */ + /* WillSendSACK */ + /* WillUseSACK */ + /* State */ + /* Nagle */ + u32 MaxSsCwnd; + u32 MaxCaCwnd; + u32 MaxSsthresh; + u32 MinSsthresh; + /* InRecovery */ + u32 DupAcksIn; + u32 SpuriousFrDetected; + u32 SpuriousRtoDetected; + u32 SoftErrors; + u32 SoftErrorReason; + u32 SlowStart; + u32 CongAvoid; + u32 OtherReductions; + u32 CongOverCount; + u32 FastRetran; + u32 SubsequentTimeouts; + /* CurTimeoutCount */ + u32 AbruptTimeouts; + u32 SACKsRcvd; + u32 SACKBlocksRcvd; + u32 SendStall; + u32 DSACKDups; + u32 MaxMSS; + u32 MinMSS; + u32 SndInitial; + u32 RecInitial; + u32 CurRetxQueue; + u32 MaxRetxQueue; + /* CurReasmQueue */ + u32 MaxReasmQueue; + u32 EarlyRetrans; + u32 EarlyRetransDelay; +}; + +struct tcp_estats_app_table { + /* SndUna */ + /* SndNxt */ + u32 SndMax; + u64 ThruOctetsAcked; + /* RcvNxt */ + u64 ThruOctetsReceived; + /* CurAppWQueue */ + u32 MaxAppWQueue; + /* CurAppRQueue */ + u32 MaxAppRQueue; +}; + +struct tcp_estats_tune_table { + /* LimCwnd */ + u32 LimSsthresh; + /* LimRwin */ + /* LimMSS */ +}; + +struct tcp_estats_extras_table { + u32 OtherReductionsCV; + u32 OtherReductionsCM; +}; + +struct tcp_estats_tables { + struct tcp_estats_connection_table *connection_table; + struct tcp_estats_perf_table *perf_table; + struct tcp_estats_path_table *path_table; + struct tcp_estats_stack_table *stack_table; + struct tcp_estats_app_table *app_table; + struct tcp_estats_tune_table *tune_table; + struct tcp_estats_extras_table *extras_table; +}; + +struct tcp_estats { + int tcpe_cid; // idr map id + + struct sock *sk; + kuid_t uid; + kgid_t gid; + int ids; + + atomic_t users; + + int limstate; + ktime_t limstate_ts; + ktime_t start_ts; + ktime_t current_ts; + struct timeval start_tv; + + int queued; + struct work_struct create_notify; + struct work_struct establish_notify; + struct delayed_work destroy_notify; + + struct tcp_estats_tables tables; +}; + +extern struct idr tcp_estats_idr; + +extern int 
tcp_estats_wq_enabled; +extern struct workqueue_struct *tcp_estats_wq; +extern void (*create_notify_func)(struct work_struct *work); +extern void (*establish_notify_func)(struct work_struct *work); +extern void (*destroy_notify_func)(struct work_struct *work); + +extern unsigned long persist_delay; +extern spinlock_t tcp_estats_idr_lock; + +/* For the TCP code */ +extern int tcp_estats_create(struct sock *sk, enum tcp_estats_addrtype t, + int active); +extern void tcp_estats_destroy(struct sock *sk); +extern void tcp_estats_free(struct tcp_estats *stats); +extern void tcp_estats_establish(struct sock *sk); + +extern void tcp_estats_update_snd_nxt(struct tcp_sock *tp); +extern void tcp_estats_update_acked(struct tcp_sock *tp, u32 ack); +extern void tcp_estats_update_rtt(struct sock *sk, unsigned long rtt_sample); +extern void tcp_estats_update_timeout(struct sock *sk); +extern void tcp_estats_update_mss(struct tcp_sock *tp); +extern void tcp_estats_update_rwin_rcvd(struct tcp_sock *tp); +extern void tcp_estats_update_sndlim(struct tcp_sock *tp, + enum tcp_estats_sndlim_states why); +extern void tcp_estats_update_rcvd(struct tcp_sock *tp, u32 seq); +extern void tcp_estats_update_rwin_sent(struct tcp_sock *tp); +extern void tcp_estats_update_congestion(struct tcp_sock *tp); +extern void tcp_estats_update_post_congestion(struct tcp_sock *tp); +extern void tcp_estats_update_segsend(struct sock *sk, int len, int pcount, + u32 seq, u32 end_seq, int flags); +extern void tcp_estats_update_segrecv(struct tcp_sock *tp, struct sk_buff *skb); +extern void tcp_estats_update_finish_segrecv(struct tcp_sock *tp); +extern void tcp_estats_update_writeq(struct sock *sk); +extern void tcp_estats_update_recvq(struct sock *sk); + +extern void tcp_estats_init(void); + +static inline void tcp_estats_use(struct tcp_estats *stats) +{ + atomic_inc(&stats->users); +} + +static inline void tcp_estats_unuse(struct tcp_estats *stats) +{ + if (atomic_dec_and_test(&stats->users)) + tcp_estats_free(stats); +} + +#else /* !CONFIG_TCP_ESTATS */ + +#define tcp_estats_enabled (0) + +#define TCP_ESTATS_VAR_INC(tp, table, var) do {} while (0) +#define TCP_ESTATS_VAR_DEC(tp, table, var) do {} while (0) +#define TCP_ESTATS_VAR_SET(tp, table, var,val) do {} while (0) +#define TCP_ESTATS_VAR_ADD(tp, table, var,val) do {} while (0) +#define TCP_ESTATS_UPDATE(tp, func) do {} while (0) + +static inline void tcp_estats_init(void) { } +static inline void tcp_estats_establish(struct sock *sk) { } +static inline void tcp_estats_create(struct sock *sk, + enum tcp_estats_addrtype t, + int active) { } +static inline void tcp_estats_destroy(struct sock *sk) { } + +#endif /* CONFIG_TCP_ESTATS */ + +#endif /* _TCP_ESTATS_H */ diff --git a/include/net/tcp_estats_mib_var.h b/include/net/tcp_estats_mib_var.h new file mode 100644 index 0000000..a18e13c --- /dev/null +++ b/include/net/tcp_estats_mib_var.h @@ -0,0 +1,327 @@ +#ifndef _TCP_ESTATS_MIB_VAR_H_ +#define _TCP_ESTATS_MIB_VAR_H_ + +#ifdef __KERNEL__ +#include +#include +#include +#include +#else +#include +#include +#endif + +#ifdef CONFIG_TCP_ESTATS + +union estats_val { + __u64 o; + __u32 t; + __s32 s; + __u16 w; + __u8 b; +}; + +enum MIB_TABLE { + PERF_TABLE, + PATH_TABLE, + STACK_TABLE, + APP_TABLE, + TUNE_TABLE, + EXTRAS_TABLE, + __MAX_TABLE +}; +#define MAX_TABLE __MAX_TABLE + +extern int max_index[]; /* MAX_TABLE */ + +/* The official MIB states are enumerated differently than Linux's. 
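+ * read_State() in tcp_estats_mib_var.c maps sk->sk_state through its + * state_map[] table onto these RFC 4898 values.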
*/ +enum tcp_estats_states { + TCP_ESTATS_STATE_CLOSED = 1, + TCP_ESTATS_STATE_LISTEN, + TCP_ESTATS_STATE_SYNSENT, + TCP_ESTATS_STATE_SYNRECEIVED, + TCP_ESTATS_STATE_ESTABLISHED, + TCP_ESTATS_STATE_FINWAIT1, + TCP_ESTATS_STATE_FINWAIT2, + TCP_ESTATS_STATE_CLOSEWAIT, + TCP_ESTATS_STATE_LASTACK, + TCP_ESTATS_STATE_CLOSING, + TCP_ESTATS_STATE_TIMEWAIT, + TCP_ESTATS_STATE_DELETECB +}; + +struct tcp_estats_connection_spec { + uint8_t rem_addr[16]; + uint8_t local_addr[16]; + uint8_t addr_type; + uint16_t rem_port; + uint16_t local_port; +}; + +enum TCP_ESTATS_TYPE { + TCP_ESTATS_UNSIGNED64, + TCP_ESTATS_UNSIGNED32, + TCP_ESTATS_SIGNED32, + TCP_ESTATS_UNSIGNED16, + TCP_ESTATS_UNSIGNED8, +}; + +struct tcp_estats_var; +typedef void (*estats_rwfunc_t)(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp); + +struct tcp_estats_var { + char *name; + u32 type; + char *table; + + estats_rwfunc_t read; + unsigned long read_data; + + estats_rwfunc_t write; + unsigned long write_data; +}; + +extern struct tcp_estats_var perf_var_array[]; +extern struct tcp_estats_var path_var_array[]; +extern struct tcp_estats_var stack_var_array[]; +extern struct tcp_estats_var app_var_array[]; +extern struct tcp_estats_var tune_var_array[]; +extern struct tcp_estats_var extras_var_array[]; + +extern struct tcp_estats_var *estats_var_array[]; + +static inline int single_index(int inda, int indb) +{ + int ret = indb; + int i; + + if (inda > 0) { + for (i = 0; i < inda; i++) { + ret += max_index[i]; + } + } + return ret; +} + +static inline void read_tcp_estats(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + vp->read(buf, stats, vp); +} + +static inline int write_tcp_estats(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + if (vp->write != NULL) { + vp->write(buf, stats, vp); + return 0; + } + return -1; +} + +static inline int tcp_estats_var_len(struct tcp_estats_var *vp) +{ + switch (vp->type) { + case TCP_ESTATS_UNSIGNED64: + return 8; + case TCP_ESTATS_UNSIGNED32: + return 4; + case TCP_ESTATS_SIGNED32: + return 4; + case TCP_ESTATS_UNSIGNED16: + return 2; + case TCP_ESTATS_UNSIGNED8: + return 1; + } + + printk(KERN_WARNING + "TCP ESTATS: Adding variable of unknown type %d.\n", vp->type); + return 0; +} + +void tcp_estats_find_var_by_iname(struct tcp_estats_var **, const char *); + +void tcp_estats_read_connection_spec(struct tcp_estats_connection_spec *, + struct tcp_estats *); + +typedef enum ESTATS_PERF_INDEX { + SEGSOUT = 0, + DATASEGSOUT, + DATAOCTETSOUT, + HCDATAOCTETSOUT, + SEGSRETRANS, + OCTETSRETRANS, + SEGSIN, + DATASEGSIN, + DATAOCTETSIN, + HCDATAOCTETSIN, + ELAPSEDSECS, + ELAPSEDMICROSECS, + STARTTIMESTAMP, + CURMSS, + PIPESIZE, + MAXPIPESIZE, + SMOOTHEDRTT, + CURRTO, + CONGSIGNALS, + CURCWND, + CURSSTHRESH, + TIMEOUTS, + CURRWINSENT, + MAXRWINSENT, + ZERORWINSENT, + CURRWINRCVD, + MAXRWINRCVD, + ZERORWINRCVD, + SNDLIMTRANSRWIN, + SNDLIMTRANSCWND, + SNDLIMTRANSSND, + SNDLIMTRANSTSODEFER, + SNDLIMTIMERWIN, + SNDLIMTIMECWND, + SNDLIMTIMESND, + SNDLIMTIMETSODEFER, + __PERF_INDEX_MAX +} ESTATS_PERF_INDEX; +#define PERF_INDEX_MAX __PERF_INDEX_MAX + +typedef enum ESTATS_PATH_INDEX { + RETRANTHRESH, + NONRECOVDAEPISODES, + SUMOCTETSREORDERED, + NONRECOVDA, + SAMPLERTT, + RTTVAR, + MAXRTT, + MINRTT, + SUMRTT, + HCSUMRTT, + COUNTRTT, + MAXRTO, + MINRTO, + IPTTL, + IPTOSIN, + IPTOSOUT, + PRECONGSUMCWND, + PRECONGSUMRTT, + POSTCONGSUMRTT, + POSTCONGCOUNTRTT, + ECNSIGNALS, + DUPACKEPISODES, + RCVRTT, + DUPACKSOUT, + CERCVD, + ECESENT, + __PATH_INDEX_MAX +} 
ESTATS_PATH_INDEX; +#define PATH_INDEX_MAX __PATH_INDEX_MAX + +typedef enum ESTATS_STACK_INDEX { + ACTIVEOPEN, + MSSSENT, + MSSRCVD, + WINSCALESENT, + WINSCALERCVD, + TIMESTAMPS, + ECN, + WILLSENDSACK, + WILLUSESACK, + STATE, + NAGLE, + MAXSSCWND, + MAXCACWND, + MAXSSTHRESH, + MINSSTHRESH, + INRECOVERY, + DUPACKSIN, + SPURIOUSFRDETECTED, + SPURIOUSRTODETECTED, + SOFTERRORS, + SOFTERRORREASON, + SLOWSTART, + CONGAVOID, + OTHERREDUCTIONS, + CONGOVERCOUNT, + FASTRETRAN, + SUBSEQUENTTIMEOUTS, + CURTIMEOUTCOUNT, + ABRUPTTIMEOUTS, + SACKSRCVD, + SACKBLOCKSRCVD, + SENDSTALL, + DSACKDUPS, + MAXMSS, + MINMSS, + SNDINITIAL, + RECINITIAL, + CURRETXQUEUE, + MAXRETXQUEUE, + CURREASMQUEUE, + MAXREASMQUEUE, + EARLYRETRANS, + EARLYRETRANSDELAY, + __STACK_INDEX_MAX +} ESTATS_STACK_INDEX; +#define STACK_INDEX_MAX __STACK_INDEX_MAX + +typedef enum ESTATS_APP_INDEX { + SNDUNA, + SNDNXT, + SNDMAX, + THRUOCTETSACKED, + HCTHRUOCTETSACKED, + RCVNXT, + THRUOCTETSRECEIVED, + HCTHRUOCTETSRECEIVED, + CURAPPWQUEUE, + MAXAPPWQUEUE, + CURAPPRQUEUE, + MAXAPPRQUEUE, + __APP_INDEX_MAX +} ESTATS_APP_INDEX; +#define APP_INDEX_MAX __APP_INDEX_MAX + +typedef enum ESTATS_TUNE_INDEX { + LIMCWND, + LIMSSTHRESH, + LIMRWIN, + LIMMSS, + __TUNE_INDEX_MAX +} ESTATS_TUNE_INDEX; +#define TUNE_INDEX_MAX __TUNE_INDEX_MAX + +typedef enum ESTATS_EXTRAS_INDEX { + OTHERREDUCTIONSCV, + OTHERREDUCTIONSCM, + __EXTRAS_INDEX_MAX +} ESTATS_EXTRAS_INDEX; +#define EXTRAS_INDEX_MAX __EXTRAS_INDEX_MAX + +#define TOTAL_NUM_VARS (PERF_INDEX_MAX + \ + PATH_INDEX_MAX + \ + STACK_INDEX_MAX + \ + APP_INDEX_MAX + \ + TUNE_INDEX_MAX + \ + EXTRAS_INDEX_MAX) + +#if BITS_PER_LONG == 64 +#define DEFAULT_PERF_MASK (1UL << PERF_INDEX_MAX)-1 +#define DEFAULT_PATH_MASK (1UL << PATH_INDEX_MAX)-1 +#define DEFAULT_STACK_MASK (1UL << STACK_INDEX_MAX)-1 +#define DEFAULT_APP_MASK (1UL << APP_INDEX_MAX)-1 +#define DEFAULT_TUNE_MASK (1UL << TUNE_INDEX_MAX)-1 +#define DEFAULT_EXTRAS_MASK (1UL << EXTRAS_INDEX_MAX)-1 +#else +#define DEFAULT_PERF_MASK (1ULL << PERF_INDEX_MAX)-1 +#define DEFAULT_PATH_MASK (1ULL << PATH_INDEX_MAX)-1 +#define DEFAULT_STACK_MASK (1ULL << STACK_INDEX_MAX)-1 +#define DEFAULT_APP_MASK (1ULL << APP_INDEX_MAX)-1 +#define DEFAULT_TUNE_MASK (1ULL << TUNE_INDEX_MAX)-1 +#define DEFAULT_EXTRAS_MASK (1ULL << EXTRAS_INDEX_MAX)-1 +#endif + +#else +#endif /* CONFIG_TCP_ESTATS */ + +#endif /* _TCP_ESTATS_MIB_VAR_H_ */ diff --git a/include/net/tcp_estats_nl.h b/include/net/tcp_estats_nl.h new file mode 100644 index 0000000..70f2c84 --- /dev/null +++ b/include/net/tcp_estats_nl.h @@ -0,0 +1,74 @@ +#ifndef _TCP_ESTATS_NL_H_ +#define _TCP_ESTATS_NL_H_ + +enum nl_estats_msg_types { + TCPE_CMD_LIST_CONNS, + TCPE_CMD_READ_ALL, + TCPE_CMD_READ_VARS, + TCPE_CMD_WRITE_VAR, + NLE_MSG_MAX +}; + +enum nl_estats_attr { + NLE_ATTR_UNSPEC, + NLE_ATTR_PERF, + NLE_ATTR_PATH, + NLE_ATTR_STACK, + NLE_ATTR_APP, + NLE_ATTR_TUNE, + NLE_ATTR_EXTRAS, + NLE_ATTR_PERF_MASK, + NLE_ATTR_PATH_MASK, + NLE_ATTR_STACK_MASK, + NLE_ATTR_APP_MASK, + NLE_ATTR_TUNE_MASK, + NLE_ATTR_EXTRAS_MASK, + NLE_ATTR_MASK, + NLE_ATTR_4TUPLE, + NLE_ATTR_WRITE, + NLE_ATTR_TIME, + __NLE_ATTR_MAX +}; +#define NLE_ATTR_MAX (__NLE_ATTR_MAX - 1) + +enum neattr_4tuple { + NEA_UNSPEC, + NEA_REM_ADDR, + NEA_REM_PORT, + NEA_LOCAL_ADDR, + NEA_LOCAL_PORT, + NEA_ADDR_TYPE, + NEA_CID, + __NEA_4TUPLE_MAX +}; +#define NEA_4TUPLE_MAX (__NEA_4TUPLE_MAX - 1) + +enum neattr_mask { + NEA_UNSPEC_MASK, + NEA_PERF_MASK, + NEA_PATH_MASK, + NEA_STACK_MASK, + NEA_APP_MASK, + NEA_TUNE_MASK, + NEA_EXTRAS_MASK, + __NEA_MASK_MAX +}; +#define 
NEA_MASK_MAX (__NEA_MASK_MAX - 1) + +enum neattr_write { + NEA_UNSPEC_WRITE, + NEA_WRITE_VAR, + NEA_WRITE_VAL, + __NEA_WRITE_MAX +}; +#define NEA_WRITE_MAX (__NEA_WRITE_MAX - 1) + +enum neattr_time { + NEA_UNSPEC_TIME, + NEA_TIME_SEC, + NEA_TIME_USEC, + __NEA_TIME_MAX +}; +#define NEA_TIME_MAX (__NEA_TIME_MAX - 1) + +#endif /* _TCP_ESTATS_NL_H_ */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7944df7..4bd1d1f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -632,3 +632,39 @@ config TCP_MD5SIG on the Internet. If unsure, say N. + +config TCP_ESTATS + bool "TCP: Extended TCP statistics (RFC4898) MIB" + ---help--- + Support for the TCP extended statistics MIB, RFC 4898. + (see http://www.web10g.org) + +if TCP_ESTATS + +config TCP_ESTATS_STRICT_ELAPSEDTIME + bool "TCP: ESTATS strict ElapsedSecs/Msecs counters" + depends on TCP_ESTATS + default n + ---help--- + Elapsed time since beginning of connection. + RFC4898 defines ElapsedSecs/Msecs as being updated at each protocol + event (sending or receiving of a segment); as this can be a + performance hit, leaving this config option off will update the elapsed + time only when it is read instead. + Set to Y for strict conformance with the MIB. + + If unsure, say N. + +endif + +if TCP_ESTATS + +config TCP_ESTATS_NETLINK + tristate "TCP: ESTATS netlink module" + depends on TCP_ESTATS + default m + ---help--- + Netlink module exposing RFC4898 TCP Extended metrics. + See http://www.web10g.org for more details. + +endif diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 15ca63e..bf8fce1 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \ datagram.o raw.o udp.o udplite.o \ arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o \ - inet_fragment.o ping.o + inet_fragment.o ping.o tcp_estats_mib_var.o obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o obj-$(CONFIG_PROC_FS) += proc.o @@ -32,6 +32,8 @@ obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o +obj-$(CONFIG_TCP_ESTATS) += tcp_estats.o +obj-$(CONFIG_TCP_ESTATS_NETLINK) += tcp_estats_nl.o obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 960fd29..ca749a5 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -39,6 +39,11 @@ static int ip_ttl_max = 255; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; +/* Extended statistics (RFC4898).
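+ * The sysctl value is a bitmask of the TCP_ESTATS_TABLEMASK_* bits, selecting + * which per-connection MIB tables tcp_estats_create() allocates; zero (the + * default) leaves statistics collection disabled.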
*/ +#ifdef CONFIG_TCP_ESTATS +int sysctl_tcp_estats __read_mostly; +#endif /* CONFIG_TCP_ESTATS */ + /* Update system visible IP port range */ static void set_local_port_range(int range[2]) { @@ -785,6 +790,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &one }, + { + .procname = "tcp_estats", + .data = &sysctl_tcp_estats, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cdeb839..572b460 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -425,6 +425,10 @@ void tcp_init_sock(struct sock *sk) sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; +#ifdef CONFIG_TCP_ESTATS + tp->tcp_stats = NULL; +#endif + local_bh_disable(); sock_update_memcg(sk); sk_sockets_allocated_inc(sk); @@ -932,6 +936,9 @@ wait_for_sndbuf: wait_for_memory: tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + if (copied) + TCP_ESTATS_UPDATE(tp, tcp_estats_update_writeq(sk)); + if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -1213,8 +1220,10 @@ new_segment: wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: - if (copied) + if (copied) { tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + TCP_ESTATS_UPDATE(tp, tcp_estats_update_writeq(sk)); + } if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -1654,6 +1663,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } + TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk)); + /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !sk->sk_backlog.tail) @@ -3668,6 +3679,7 @@ void __init tcp_init(void) tcp_metrics_init(); tcp_register_congestion_control(&tcp_reno); + tcp_estats_init(); memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 019c238..4e20c9a 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -311,13 +311,25 @@ void tcp_slow_start(struct tcp_sock *tp) int cnt; /* increase in packets */ unsigned int delta = 0; u32 snd_cwnd = tp->snd_cwnd; - + u32 limssthresh = 0; +#ifdef CONFIG_TCP_ESTATS + struct tcp_estats *stats = tp->tcp_stats; +#endif + if (unlikely(!snd_cwnd)) { pr_err_once("snd_cwnd is nul, please report this bug.\n"); snd_cwnd = 1U; } - if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) + TCP_ESTATS_VAR_INC(tp, stack_table, SlowStart); + +#ifdef CONFIG_TCP_ESTATS + if (stats && stats->tables.tune_table) + limssthresh = (stats->tables.tune_table)->LimSsthresh; +#endif + if (limssthresh > 0 && tp->snd_cwnd > limssthresh) + cnt = limssthresh >> 1; /* limited slow start */ + else if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh) cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */ else cnt = snd_cwnd; /* exponential increase */ @@ -334,6 +346,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) { + TCP_ESTATS_VAR_INC(tp, stack_table, CongAvoid); if (tp->snd_cwnd_cnt >= w) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; diff --git a/net/ipv4/tcp_estats.c b/net/ipv4/tcp_estats.c new file mode 100644 index 0000000..1140161 --- /dev/null +++ b/net/ipv4/tcp_estats.c @@ -0,0 +1,687 @@ +/* + * 
net/ipv4/tcp_estats.c + * + * Implementation of TCP ESTATS MIB (RFC 4898) + * + * Authors: + * John Estabrook + * Andrew K. Adams + * John Heffner + * Matt Mathis + * Jeff Semke + * + * The Web10Gig project. See http://www.web10gig.org + * + * Copyright © 2011, Pittsburgh Supercomputing Center (PSC) and + * National Center for Supercomputing Applications (NCSA). + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ESTATS_INF32 0xffffffff +#define ESTATS_MAX_CID 1024 + +extern int sysctl_tcp_estats; + +struct idr tcp_estats_idr; +EXPORT_SYMBOL(tcp_estats_idr); +static int next_id = 1; +DEFINE_SPINLOCK(tcp_estats_idr_lock); +EXPORT_SYMBOL(tcp_estats_idr_lock); + +int tcp_estats_wq_enabled __read_mostly = 0; +EXPORT_SYMBOL(tcp_estats_wq_enabled); +struct workqueue_struct *tcp_estats_wq = NULL; +EXPORT_SYMBOL(tcp_estats_wq); +void (*create_notify_func)(struct work_struct *work); +EXPORT_SYMBOL(create_notify_func); +void (*establish_notify_func)(struct work_struct *work); +EXPORT_SYMBOL(establish_notify_func); +void (*destroy_notify_func)(struct work_struct *work); +EXPORT_SYMBOL(destroy_notify_func); +unsigned long persist_delay = 0; +EXPORT_SYMBOL(persist_delay); + +struct static_key tcp_estats_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(tcp_estats_enabled); + +static inline void tcp_estats_enable(void) +{ + static_key_slow_inc(&tcp_estats_enabled); +} + +static inline void tcp_estats_disable(void) +{ + static_key_slow_dec(&tcp_estats_enabled); +} + +/* Calculates the required amount of memory for any enabled tables. */ +int tcp_estats_get_allocation_size(int sysctl) +{ + int size = sizeof(struct tcp_estats) + + sizeof(struct tcp_estats_connection_table); + + if (sysctl & TCP_ESTATS_TABLEMASK_PERF) + size += sizeof(struct tcp_estats_perf_table); + if (sysctl & TCP_ESTATS_TABLEMASK_PATH) + size += sizeof(struct tcp_estats_path_table); + if (sysctl & TCP_ESTATS_TABLEMASK_STACK) + size += sizeof(struct tcp_estats_stack_table); + if (sysctl & TCP_ESTATS_TABLEMASK_APP) + size += sizeof(struct tcp_estats_app_table); + if (sysctl & TCP_ESTATS_TABLEMASK_TUNE) + size += sizeof(struct tcp_estats_tune_table); + if (sysctl & TCP_ESTATS_TABLEMASK_EXTRAS) + size += sizeof(struct tcp_estats_extras_table); + return size; +} + +/* Called whenever a TCP/IPv4 sock is created. + * net/ipv4/tcp_ipv4.c: tcp_v4_syn_recv_sock, + * tcp_v4_init_sock + * Allocates a stats structure and initializes values. + */ +int tcp_estats_create(struct sock *sk, enum tcp_estats_addrtype addrtype, + int active) +{ + struct tcp_estats *stats; + struct tcp_estats_tables *tables; + struct tcp_sock *tp = tcp_sk(sk); + void *estats_mem; + int sysctl; + int ret; + + /* Read the sysctl once before calculating memory needs and initializing + * tables to avoid raciness. 
*/ + sysctl = ACCESS_ONCE(sysctl_tcp_estats); + if (likely(sysctl == TCP_ESTATS_TABLEMASK_INACTIVE)) { + return 0; + } + + estats_mem = kzalloc(tcp_estats_get_allocation_size(sysctl), gfp_any()); + if (!estats_mem) + return -ENOMEM; + + + stats = estats_mem; + estats_mem += sizeof(struct tcp_estats); + + tables = &stats->tables; + + tables->connection_table = estats_mem; + estats_mem += sizeof(struct tcp_estats_connection_table); + + if (sysctl & TCP_ESTATS_TABLEMASK_PERF) { + tables->perf_table = estats_mem; + estats_mem += sizeof(struct tcp_estats_perf_table); + } + if (sysctl & TCP_ESTATS_TABLEMASK_PATH) { + tables->path_table = estats_mem; + estats_mem += sizeof(struct tcp_estats_path_table); + } + if (sysctl & TCP_ESTATS_TABLEMASK_STACK) { + tables->stack_table = estats_mem; + estats_mem += sizeof(struct tcp_estats_stack_table); + } + if (sysctl & TCP_ESTATS_TABLEMASK_APP) { + tables->app_table = estats_mem; + estats_mem += sizeof(struct tcp_estats_app_table); + } + if (sysctl & TCP_ESTATS_TABLEMASK_TUNE) { + tables->tune_table = estats_mem; + estats_mem += sizeof(struct tcp_estats_tune_table); + } + if (sysctl & TCP_ESTATS_TABLEMASK_EXTRAS) { + tables->extras_table = estats_mem; + estats_mem += sizeof(struct tcp_estats_extras_table); + } + + stats->tcpe_cid = -1; + stats->queued = 0; + + tables->connection_table->AddressType = addrtype; + + sock_hold(sk); + stats->sk = sk; + atomic_set(&stats->users, 0); + + stats->limstate = TCP_ESTATS_SNDLIM_STARTUP; + stats->start_ts = stats->limstate_ts = stats->current_ts = ktime_get(); + do_gettimeofday(&stats->start_tv); + + TCP_ESTATS_VAR_SET(tp, stack_table, ActiveOpen, active); + TCP_ESTATS_VAR_SET(tp, app_table, SndMax, tp->snd_nxt); + TCP_ESTATS_VAR_SET(tp, stack_table, SndInitial, tp->snd_nxt); + TCP_ESTATS_VAR_SET(tp, tune_table, LimSsthresh, + sysctl_tcp_max_ssthresh); + + TCP_ESTATS_VAR_SET(tp, path_table, MinRTT, ESTATS_INF32); + TCP_ESTATS_VAR_SET(tp, path_table, MinRTO, ESTATS_INF32); + TCP_ESTATS_VAR_SET(tp, stack_table, MinMSS, ESTATS_INF32); + TCP_ESTATS_VAR_SET(tp, stack_table, MinSsthresh, ESTATS_INF32); + + tp->tcp_stats = stats; + tcp_estats_use(stats); + + if (tcp_estats_wq_enabled) { + tcp_estats_use(stats); + stats->queued = 1; + stats->tcpe_cid = 0; + INIT_WORK(&stats->create_notify, create_notify_func); + ret = queue_work(tcp_estats_wq, &stats->create_notify); + } + + tcp_estats_enable(); + + return 0; +} +EXPORT_SYMBOL(tcp_estats_create); + +void tcp_estats_destroy(struct sock *sk) +{ + struct tcp_estats *stats = tcp_sk(sk)->tcp_stats; + + if (stats == NULL) + return; + + /* Attribute final sndlim time. */ + tcp_estats_update_sndlim(tcp_sk(stats->sk), stats->limstate); + + if (tcp_estats_wq_enabled && stats->queued) { + INIT_DELAYED_WORK(&stats->destroy_notify, + destroy_notify_func); + queue_delayed_work(tcp_estats_wq, &stats->destroy_notify, + persist_delay); + + } + tcp_estats_unuse(stats); +} + +/* Do not call directly. Called from tcp_estats_unuse(). */ +void tcp_estats_free(struct tcp_estats *stats) +{ + tcp_estats_disable(); + sock_put(stats->sk); + kfree(stats); +} +EXPORT_SYMBOL(tcp_estats_free); + +/* Called when a connection enters the ESTABLISHED state, and has all its + * state initialized. + * net/ipv4/tcp_input.c: tcp_rcv_state_process, + * tcp_rcv_synsent_state_process + * Here we link the statistics structure in so it is visible in the /proc + * fs, and do some final init. 
+ */ +void tcp_estats_establish(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_estats *stats = tp->tcp_stats; + struct tcp_estats_connection_table *conn_table; + + if (stats == NULL) + return; + + conn_table = stats->tables.connection_table; + + /* Let's set these here, since they can't change once the + * connection is established. + */ + conn_table->LocalPort = inet->inet_num; + conn_table->RemPort = ntohs(inet->inet_dport); + + if (conn_table->AddressType == TCP_ESTATS_ADDRTYPE_IPV4) { + memcpy(&conn_table->LocalAddress, &inet->inet_rcv_saddr, 4); + memcpy(&conn_table->RemAddress, &inet->inet_daddr, 4); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (conn_table->AddressType == TCP_ESTATS_ADDRTYPE_IPV6) { + memcpy(&conn_table->LocalAddress, &(inet6_sk(sk)->saddr), 16); + memcpy(&conn_table->RemAddress, &(inet6_sk(sk)->daddr), 16); + } +#endif + else { + pr_err("TCP ESTATS: AddressType not valid.\n"); + } + + tcp_estats_update_finish_segrecv(tp); + tcp_estats_update_rwin_rcvd(tp); + tcp_estats_update_rwin_sent(tp); + + TCP_ESTATS_VAR_SET(tp, stack_table, RecInitial, tp->rcv_nxt); + + tcp_estats_update_sndlim(tp, TCP_ESTATS_SNDLIM_SENDER); + + if (tcp_estats_wq_enabled && stats->queued) { + INIT_WORK(&stats->establish_notify, establish_notify_func); + queue_work(tcp_estats_wq, &stats->establish_notify); + } +} + +/* + * Statistics update functions + */ + +void tcp_estats_update_snd_nxt(struct tcp_sock *tp) +{ + struct tcp_estats *stats = tp->tcp_stats; + + if (stats->tables.app_table) { + if (after(tp->snd_nxt, stats->tables.app_table->SndMax)) + stats->tables.app_table->SndMax = tp->snd_nxt; + } +} + +void tcp_estats_update_acked(struct tcp_sock *tp, u32 ack) +{ + struct tcp_estats *stats = tp->tcp_stats; + + if (stats->tables.app_table) + stats->tables.app_table->ThruOctetsAcked += ack - tp->snd_una; +} + +void tcp_estats_update_rtt(struct sock *sk, unsigned long rtt_sample) +{ + struct tcp_estats *stats = tcp_sk(sk)->tcp_stats; + struct tcp_estats_path_table *path_table = stats->tables.path_table; + unsigned long rtt_sample_msec = rtt_sample * 1000 / HZ; + u32 rto; + + if (path_table == NULL) + return; + + path_table->SampleRTT = rtt_sample_msec; + + if (rtt_sample_msec > path_table->MaxRTT) + path_table->MaxRTT = rtt_sample_msec; + if (rtt_sample_msec < path_table->MinRTT) + path_table->MinRTT = rtt_sample_msec; + + path_table->CountRTT++; + path_table->SumRTT += rtt_sample_msec; + + rto = inet_csk(sk)->icsk_rto * 1000 / HZ; + if (rto > path_table->MaxRTO) + path_table->MaxRTO = rto; + if (rto < path_table->MinRTO) + path_table->MinRTO = rto; +} + +void tcp_estats_update_timeout(struct sock *sk) +{ + if (inet_csk(sk)->icsk_backoff) + TCP_ESTATS_VAR_INC(tcp_sk(sk), stack_table, SubsequentTimeouts); + else + TCP_ESTATS_VAR_INC(tcp_sk(sk), perf_table, Timeouts); + + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open) + TCP_ESTATS_VAR_INC(tcp_sk(sk), stack_table, AbruptTimeouts); +} + +void tcp_estats_update_mss(struct tcp_sock *tp) +{ + struct tcp_estats *stats = tp->tcp_stats; + struct tcp_estats_stack_table *stack_table = stats->tables.stack_table; + int mss = tp->mss_cache; + + if (stack_table == NULL) + return; + + if (mss > stack_table->MaxMSS) + stack_table->MaxMSS = mss; + if (mss < stack_table->MinMSS) + stack_table->MinMSS = mss; +} + +void tcp_estats_update_finish_segrecv(struct tcp_sock *tp) +{ + struct tcp_estats *stats = tp->tcp_stats; + struct tcp_estats_tables *tables = &stats->tables; + 
struct tcp_estats_perf_table *perf_table = tables->perf_table; + struct tcp_estats_stack_table *stack_table = tables->stack_table; + u32 mss = tp->mss_cache; + u32 cwnd; + u32 ssthresh; + u32 pipe_size; + +#ifdef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME + stats->current_ts = ktime_get(); +#endif + + if (stack_table != NULL) { + cwnd = tp->snd_cwnd * mss; + if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (cwnd > stack_table->MaxSsCwnd) + stack_table->MaxSsCwnd = cwnd; + } else if (cwnd > stack_table->MaxCaCwnd) { + stack_table->MaxCaCwnd = cwnd; + } + } + + if (perf_table != NULL) { + pipe_size = tcp_packets_in_flight(tp) * mss; + if (pipe_size > perf_table->MaxPipeSize) + perf_table->MaxPipeSize = pipe_size; + } + + /* Discard initial ssthresh set at infinity. */ + if (tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH) { + return; + } + + if (stack_table != NULL) { + ssthresh = tp->snd_ssthresh * tp->mss_cache; + if (ssthresh > stack_table->MaxSsthresh) + stack_table->MaxSsthresh = ssthresh; + if (ssthresh < stack_table->MinSsthresh) + stack_table->MinSsthresh = ssthresh; + } +} +EXPORT_SYMBOL(tcp_estats_update_finish_segrecv); + +void tcp_estats_update_rwin_rcvd(struct tcp_sock *tp) +{ + struct tcp_estats *stats = tp->tcp_stats; + struct tcp_estats_perf_table *perf_table = stats->tables.perf_table; + u32 win = tp->snd_wnd; + + if (perf_table == NULL) + return; + + if (win > perf_table->MaxRwinRcvd) + perf_table->MaxRwinRcvd = win; + if (win == 0) + perf_table->ZeroRwinRcvd++; +} + +void tcp_estats_update_rwin_sent(struct tcp_sock *tp) +{ + struct tcp_estats *stats = tp->tcp_stats; + struct tcp_estats_perf_table *perf_table = stats->tables.perf_table; + u32 win = tp->rcv_wnd; + + if (perf_table == NULL) + return; + + if (win > perf_table->MaxRwinSent) + perf_table->MaxRwinSent = win; + if (win == 0) + perf_table->ZeroRwinSent++; +} + +void tcp_estats_update_sndlim(struct tcp_sock *tp, + enum tcp_estats_sndlim_states state) +{ + struct tcp_estats *stats = tp->tcp_stats; + struct tcp_estats_perf_table *perf_table = stats->tables.perf_table; + ktime_t now; + + if (state <= TCP_ESTATS_SNDLIM_NONE || + state >= TCP_ESTATS_SNDLIM_NSTATES) { + pr_err("tcp_estats_update_sndlim: BUG: state out of range %d\n", + state); + return; + } + + if (perf_table == NULL) + return; + + now = ktime_get(); + perf_table->snd_lim_time[stats->limstate] + += ktime_to_us(ktime_sub(now, stats->limstate_ts)); + stats->limstate_ts = now; + if (stats->limstate != state) { + stats->limstate = state; + perf_table->snd_lim_trans[state]++; + } +} + +void tcp_estats_update_congestion(struct tcp_sock *tp) +{ + struct tcp_estats *stats = tp->tcp_stats; + struct tcp_estats_path_table *path_table = stats->tables.path_table; + + TCP_ESTATS_VAR_INC(tp, perf_table, CongSignals); + + if (path_table != NULL) { + path_table->PreCongSumCwnd += tp->snd_cwnd * tp->mss_cache; + path_table->PreCongSumRTT += path_table->SampleRTT; + } +} + +void tcp_estats_update_post_congestion(struct tcp_sock *tp) +{ + struct tcp_estats *stats = tp->tcp_stats; + struct tcp_estats_path_table *path_table = stats->tables.path_table; + + if (path_table != NULL) { + path_table->PostCongCountRTT++; + path_table->PostCongSumRTT += path_table->SampleRTT; + } +} + +void tcp_estats_update_segsend(struct sock *sk, int len, int pcount, + u32 seq, u32 end_seq, int flags) +{ + struct tcp_estats *stats = tcp_sk(sk)->tcp_stats; + struct tcp_estats_perf_table *perf_table = stats->tables.perf_table; + struct tcp_estats_app_table *app_table = stats->tables.app_table; + +#ifdef
CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME + stats->current_ts = ktime_get(); +#endif + + if (perf_table == NULL) + return; + + /* We know we're sending a segment. */ + perf_table->SegsOut += pcount; + + /* A pure ACK contains no data; everything else is data. */ + if (len > 0) { + perf_table->DataSegsOut += pcount; + perf_table->DataOctetsOut += len; + } + + /* Check for retransmission. */ + if (flags & TCPHDR_SYN) { + if (inet_csk(sk)->icsk_retransmits) + perf_table->SegsRetrans++; + } else if (app_table != NULL && + before(seq, app_table->SndMax)) { + perf_table->SegsRetrans += pcount; + perf_table->OctetsRetrans += end_seq - seq; + } +} + +void tcp_estats_update_segrecv(struct tcp_sock *tp, struct sk_buff *skb) +{ + struct tcp_estats_tables *tables = &tp->tcp_stats->tables; + struct tcp_estats_path_table *path_table = tables->path_table; + struct tcp_estats_perf_table *perf_table = tables->perf_table; + struct tcp_estats_stack_table *stack_table = tables->stack_table; + struct tcphdr *th = tcp_hdr(skb); + struct iphdr *iph = ip_hdr(skb); + + if (perf_table != NULL) + perf_table->SegsIn++; + + if (skb->len == th->doff * 4) { + if (stack_table != NULL && + TCP_SKB_CB(skb)->ack_seq == tp->snd_una) + stack_table->DupAcksIn++; + } else { + if (perf_table != NULL) { + perf_table->DataSegsIn++; + perf_table->DataOctetsIn += skb->len - th->doff * 4; + } + } + + if (path_table != NULL) { + path_table->IpTtl = iph->ttl; + path_table->IpTosIn = iph->tos; + } +} +EXPORT_SYMBOL(tcp_estats_update_segrecv); + +void tcp_estats_update_rcvd(struct tcp_sock *tp, u32 seq) +{ + TCP_ESTATS_VAR_ADD(tp, app_table, ThruOctetsReceived, + seq - tp->rcv_nxt); +} + +void tcp_estats_update_writeq(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_estats_app_table *app_table = + tp->tcp_stats->tables.app_table; + int len; + + if (app_table == NULL) + return; + + len = tp->write_seq - app_table->SndMax; + + if (len > app_table->MaxAppWQueue) + app_table->MaxAppWQueue = len; +} + +static inline u32 ofo_qlen(struct tcp_sock *tp) +{ + if (!skb_peek(&tp->out_of_order_queue)) + return 0; + else + return TCP_SKB_CB(tp->out_of_order_queue.prev)->end_seq - + TCP_SKB_CB(tp->out_of_order_queue.next)->seq; +} + +void tcp_estats_update_recvq(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_estats_tables *tables = &tp->tcp_stats->tables; + struct tcp_estats_app_table *app_table = tables->app_table; + struct tcp_estats_stack_table *stack_table = tables->stack_table; + u32 len1 = tp->rcv_nxt - tp->copied_seq; + u32 len2 = ofo_qlen(tp); + + if (app_table != NULL) { + if (app_table->MaxAppRQueue < len1) + app_table->MaxAppRQueue = len1; + } + + if (stack_table != NULL) { + if (stack_table->MaxReasmQueue < len2) + stack_table->MaxReasmQueue = len2; + } +} + +/* + * Manage connection ID table + */ + +static int get_new_cid(struct tcp_estats *stats) +{ + int err; + int id_cid; + +again: + if (unlikely(idr_pre_get(&tcp_estats_idr, GFP_KERNEL) == 0)) + return -ENOMEM; + + spin_lock_bh(&tcp_estats_idr_lock); + err = idr_get_new_above(&tcp_estats_idr, stats, next_id, &id_cid); + if (!err) { + next_id = (id_cid + 1) % ESTATS_MAX_CID; + stats->tcpe_cid = id_cid; + } + spin_unlock_bh(&tcp_estats_idr_lock); + + if (unlikely(err == -EAGAIN)) + goto again; + else if (unlikely(err)) + return err; + + return 0; +} + +static void create_func(struct work_struct *work) +{ + // stub for netlink notification of new connections + ; +} + +static void establish_func(struct work_struct *work) +{ + struct tcp_estats *stats = 
container_of(work, struct tcp_estats, + establish_notify); + int err = 0; + + if ((stats->tcpe_cid) > 0) { + pr_err("TCP estats container established multiple times.\n"); + return; + } + + if ((stats->tcpe_cid) == 0) { + err = get_new_cid(stats); + if (err) + pr_devel("get_new_cid error %d\n", err); + } +} + +static void destroy_func(struct work_struct *work) +{ + struct tcp_estats *stats = container_of(work, struct tcp_estats, + destroy_notify.work); + + int id_cid = stats->tcpe_cid; + + if (id_cid == 0) + pr_devel("TCP estats destroyed before being established.\n"); + + if (id_cid >= 0) { + if (id_cid) { + spin_lock_bh(&tcp_estats_idr_lock); + idr_remove(&tcp_estats_idr, id_cid); + spin_unlock_bh(&tcp_estats_idr_lock); + } + stats->tcpe_cid = -1; + + tcp_estats_unuse(stats); + } +} + +void __init tcp_estats_init() +{ + idr_init(&tcp_estats_idr); + + create_notify_func = &create_func; + establish_notify_func = &establish_func; + destroy_notify_func = &destroy_func; + + persist_delay = 5 * HZ; + + tcp_estats_wq = alloc_workqueue("tcp_estats", WQ_MEM_RECLAIM, 256); + if (tcp_estats_wq == NULL) { + pr_err("tcp_estats_init(): alloc_workqueue failed\n"); + goto cleanup_fail; + } + + tcp_estats_wq_enabled = 1; + return; + +cleanup_fail: + pr_err("TCP ESTATS: initialization failed.\n"); +} diff --git a/net/ipv4/tcp_estats_mib_var.c b/net/ipv4/tcp_estats_mib_var.c new file mode 100644 index 0000000..5d1b409 --- /dev/null +++ b/net/ipv4/tcp_estats_mib_var.c @@ -0,0 +1,623 @@ +#include +#include + +#ifdef CONFIG_TCP_ESTATS + +#define OFFSET_TP(field) ((unsigned long)(&(((struct tcp_sock *)NULL)->field))) + +static char *get_stats_base(struct tcp_estats *stats, + struct tcp_estats_var *vp) { + char* base = NULL; + + if (strcmp(vp->table, "perf_table") == 0) + base = (char *) stats->tables.perf_table; + else if (strcmp(vp->table, "path_table") == 0) + base = (char *) stats->tables.path_table; + else if (strcmp(vp->table, "stack_table") == 0) + base = (char *) stats->tables.stack_table; + else if (strcmp(vp->table, "app_table") == 0) + base = (char *) stats->tables.app_table; + else if (strcmp(vp->table, "tune_table") == 0) + base = (char *) stats->tables.tune_table; + else if (strcmp(vp->table, "extras_table") == 0) + base = (char *) stats->tables.extras_table; + + return base; +}; + +static void read_stats(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + char *base = get_stats_base(stats, vp); + if (base != NULL) + memcpy(buf, base + vp->read_data, tcp_estats_var_len(vp)); +} + +static void read_sk32(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + memcpy(buf, (char *)(stats->sk) + vp->read_data, 4); +} + +static void read_inf32(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + u64 val; + char *base = get_stats_base(stats, vp); + if (base != NULL) { + memcpy(&val, base + vp->read_data, 8); + val &= 0xffffffff; + memcpy(buf, &val, 4); + } +} + +static void read_ElapsedSecs(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + ktime_t elapsed; + u32 secs; + +#ifndef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME + stats->current_ts = ktime_get(); +#endif + elapsed = ktime_sub(stats->current_ts, stats->start_ts); + secs = ktime_to_timeval(elapsed).tv_sec; + + memcpy(buf, &secs, 4); +} + +static void read_ElapsedMicroSecs(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + ktime_t elapsed; + u32 usecs; + +#ifndef CONFIG_TCP_ESTATS_STRICT_ELAPSEDTIME + stats->current_ts = ktime_get(); +#endif + elapsed = 
ktime_sub(stats->current_ts, stats->start_ts); + usecs = ktime_to_timeval(elapsed).tv_usec; + + memcpy(buf, &usecs, 4); +} + +static void read_StartTimeStamp(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + u8 val = 0; // currently unimplemented + memcpy(buf, &val, 1); +} + +static void read_PipeSize(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = tcp_packets_in_flight(tp) * tp->mss_cache; + memcpy(buf, &val, 4); +} + +static void read_SmoothedRTT(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = (tp->srtt >> 3) * 1000 / HZ; + memcpy(buf, &val, 4); +} + +static void read_CurRTO(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct inet_connection_sock *icsk = inet_csk(stats->sk); + u32 val = icsk->icsk_rto * 1000 / HZ; + memcpy(buf, &val, 4); +} + +static void read_CurCwnd(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = tp->snd_cwnd * tp->mss_cache; + memcpy(buf, &val, 4); +} + +static void read_CurSsthresh(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = tp->snd_ssthresh <= 0x7fffffff ? + tp->snd_ssthresh * tp->mss_cache : 0xffffffff; + memcpy(buf, &val, 4); +} + +static void read_RetranThresh(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = tp->reordering; + memcpy(buf, &val, 4); +} + +static void read_RTTVar(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = (tp->rttvar >> 2) * 1000 / HZ; + memcpy(buf, &val, 4); +} + +/* Note: this value returned is technically incorrect between a + * setsockopt of IP_TOS, and when the next segment is sent. */ +static void read_IpTosOut(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct inet_sock *inet = inet_sk(stats->sk); + *(char *)buf = inet->tos; +} + +static void read_RcvRTT(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = ((1000000*tp->rcv_rtt_est.rtt)/HZ)>>3; + memcpy(buf, &val, 4); +} + +static void read_MSSSent(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = tp->advmss; + memcpy(buf, &val, 4); +} + +static void read_MSSRcvd(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val = tp->rx_opt.rec_mss; + memcpy(buf, &val, 4); +} + +/* Note: WinScaleSent and WinScaleRcvd are incorrectly + * implemented for the case where we sent a scale option + * but did not receive one. */ +static void read_WinScaleSent(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + + s32 val = tp->rx_opt.wscale_ok ? tp->rx_opt.rcv_wscale : -1; + memcpy(buf, &val, 4); +} + +static void read_WinScaleRcvd(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + + s32 val = tp->rx_opt.wscale_ok ? tp->rx_opt.snd_wscale : -1; + memcpy(buf, &val, 4); +} + +/* Note: all these (TimeStamps, ECN, SACK, Nagle) are incorrect + * if the sysctl values are changed during the connection. 
*/ +static void read_TimeStamps(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + s32 val = 1; + + if (!tp->rx_opt.tstamp_ok) + val = sysctl_tcp_timestamps ? 3 : 2; + memcpy(buf, &val, 4); +} + +static void read_ECN(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct sock *sk = stats->sk; + struct tcp_sock *tp = tcp_sk(sk); + s32 val = 1; + + if ((tp->ecn_flags & TCP_ECN_OK) == 0) + val = sock_net(sk)->ipv4.sysctl_tcp_ecn ? 3 : 2; + memcpy(buf, &val, 4); +} + +static void read_WillSendSACK(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + s32 val = 1; + + if (!tp->rx_opt.sack_ok) + val = sysctl_tcp_sack ? 3 : 2; + + memcpy(buf, &val, 4); +} + +#define read_WillUseSACK read_WillSendSACK + +static void read_State(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + /* A mapping from Linux to MIB state. */ + static char state_map[] = { 0, + TCP_ESTATS_STATE_ESTABLISHED, + TCP_ESTATS_STATE_SYNSENT, + TCP_ESTATS_STATE_SYNRECEIVED, + TCP_ESTATS_STATE_FINWAIT1, + TCP_ESTATS_STATE_FINWAIT2, + TCP_ESTATS_STATE_TIMEWAIT, + TCP_ESTATS_STATE_CLOSED, + TCP_ESTATS_STATE_CLOSEWAIT, + TCP_ESTATS_STATE_LASTACK, + TCP_ESTATS_STATE_LISTEN, + TCP_ESTATS_STATE_CLOSING }; + s32 val = state_map[stats->sk->sk_state]; + memcpy(buf, &val, 4); +} + +static void read_Nagle(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + + s32 val = tp->nonagle ? 2 : 1; + memcpy(buf, &val, 4); +} + +static void read_InRecovery(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct inet_connection_sock *icsk = inet_csk(stats->sk); + + s32 val = icsk->icsk_ca_state > TCP_CA_CWR ? 
1 : 2; + memcpy(buf, &val, 4); +} + +static void read_CurTimeoutCount(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct inet_connection_sock *icsk = inet_csk(stats->sk); + + u32 val = icsk->icsk_retransmits; + memcpy(buf, &val, 4); +} + +static inline u32 ofo_qlen(struct tcp_sock *tp) +{ + if (!skb_peek(&tp->out_of_order_queue)) + return 0; + else + return TCP_SKB_CB(tp->out_of_order_queue.prev)->end_seq - + TCP_SKB_CB(tp->out_of_order_queue.next)->seq; +} + +static void read_CurReasmQueue(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + + u32 val = ofo_qlen(tp); + memcpy(buf, &val, 4); +} + +static void read_CurAppWQueue(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + struct tcp_estats_app_table *app_table = + tp->tcp_stats->tables.app_table; + u32 val; + + if (app_table == NULL) + return; + val = tp->write_seq - app_table->SndMax; + memcpy(buf, &val, 4); +} + +static void read_CurAppRQueue(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + + u32 val = tp->rcv_nxt - tp->copied_seq; + memcpy(buf, &val, 4); +} + +static void read_LimCwnd(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + + u32 tmp = (u32) (tp->snd_cwnd_clamp * tp->mss_cache); + memcpy(buf, &tmp, 4); +} + +static void read_LimSsthresh(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_estats_tune_table *tune_table = stats->tables.tune_table; + + if (tune_table) + memcpy(buf, &tune_table->LimSsthresh, 4); +} + +static void write_LimSsthresh(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_estats_tune_table *tune_table = stats->tables.tune_table; + u32 tmp = *(u32 *) buf; + + if (tune_table) + memcpy(&tune_table->LimSsthresh, &tmp, 4); +} + +static void write_LimCwnd(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + + tp->snd_cwnd_clamp = min(*(u32 *) buf / tp->mss_cache, 65535U); +} + +static void read_LimRwin(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + memcpy(buf, (char *)(stats->sk) + OFFSET_TP(window_clamp), 4); +} + +static void write_LimRwin(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + struct tcp_sock *tp = tcp_sk(stats->sk); + u32 val; + + memcpy(&val, buf, 4); + tp->window_clamp = min(val, 65535U << tp->rx_opt.rcv_wscale); +} + +static void read_LimMSS(void *buf, struct tcp_estats *stats, + struct tcp_estats_var *vp) +{ + memcpy(buf, (char *)(stats->sk) + OFFSET_TP(rx_opt.mss_clamp), 4); +} + +#define OFFSET_ST(field, table) \ + ((unsigned long)(&(((struct tcp_estats_##table *)NULL)->field))) + +#define ESTATSVAR(__name, __type, __table) { \ + .name = #__name, \ + .type = TCP_ESTATS_##__type, \ + .table = #__table, \ + .read = read_stats, \ + .read_data = OFFSET_ST(__name, __table), \ + .write = NULL } +#define ESTATSVARN(__name, __type, __var, __table) { \ + .name = #__name, \ + .type = TCP_ESTATS_##__type, \ + .table = #__table, \ + .read = read_stats, \ + .read_data = OFFSET_ST(__var, __table), \ + .write = NULL } +#define TPVAR32(__name, __type, __var) { \ + .name = #__name, \ + .type = TCP_ESTATS_##__type, \ + .read = read_sk32, \ + .read_data = OFFSET_TP(__var), \ + .write = NULL } +#define HCINF32(__name, __type, __table) { \ + .name = #__name, \ + .type 
= TCP_ESTATS_##__type, \ + .table = #__table, \ + .read = read_inf32, \ + .read_data = OFFSET_ST(__name, __table), \ + .write = NULL } +#define READFUNC(__name, __type) { \ + .name = #__name, \ + .type = TCP_ESTATS_##__type, \ + .read = read_##__name, \ + .write = NULL } +#define RWFUNC(__name, __type) { \ + .name = #__name, \ + .type = TCP_ESTATS_##__type, \ + .read = read_##__name, \ + .write = write_##__name } + +int max_index[MAX_TABLE] = { PERF_INDEX_MAX, PATH_INDEX_MAX, STACK_INDEX_MAX, + APP_INDEX_MAX, TUNE_INDEX_MAX, EXTRAS_INDEX_MAX }; +EXPORT_SYMBOL(max_index); + +struct tcp_estats_var perf_var_array[] = { + ESTATSVAR(SegsOut,UNSIGNED32, perf_table), + ESTATSVAR(DataSegsOut,UNSIGNED32, perf_table), + HCINF32(DataOctetsOut,UNSIGNED32, perf_table), + ESTATSVARN(HCDataOctetsOut,UNSIGNED64, DataOctetsOut, perf_table), + ESTATSVAR(SegsRetrans,UNSIGNED32, perf_table), + ESTATSVAR(OctetsRetrans,UNSIGNED32, perf_table), + ESTATSVAR(SegsIn,UNSIGNED32, perf_table), + ESTATSVAR(DataSegsIn,UNSIGNED32, perf_table), + HCINF32(DataOctetsIn,UNSIGNED32, perf_table), + ESTATSVARN(HCDataOctetsIn,UNSIGNED64, DataOctetsIn, perf_table), + READFUNC(ElapsedSecs,UNSIGNED32), + READFUNC(ElapsedMicroSecs,UNSIGNED32), + READFUNC(StartTimeStamp,UNSIGNED8), + TPVAR32(CurMSS,UNSIGNED32, mss_cache), + READFUNC(PipeSize,UNSIGNED32), + ESTATSVAR(MaxPipeSize,UNSIGNED32, perf_table), + READFUNC(SmoothedRTT,UNSIGNED32), + READFUNC(CurRTO,UNSIGNED32), + ESTATSVAR(CongSignals,UNSIGNED32, perf_table), + READFUNC(CurCwnd,UNSIGNED32), + READFUNC(CurSsthresh,UNSIGNED32), + ESTATSVAR(Timeouts,UNSIGNED32, perf_table), + TPVAR32(CurRwinSent,UNSIGNED32, rcv_wnd), + ESTATSVAR(MaxRwinSent,UNSIGNED32, perf_table), + ESTATSVAR(ZeroRwinSent,UNSIGNED32, perf_table), + TPVAR32(CurRwinRcvd,UNSIGNED32, snd_wnd), + ESTATSVAR(MaxRwinRcvd,UNSIGNED32, perf_table), + ESTATSVAR(ZeroRwinRcvd,UNSIGNED32, perf_table), + ESTATSVARN(SndLimTransRwin,UNSIGNED32, + snd_lim_trans[TCP_ESTATS_SNDLIM_RWIN], perf_table), + ESTATSVARN(SndLimTransCwnd,UNSIGNED32, + snd_lim_trans[TCP_ESTATS_SNDLIM_CWND], perf_table), + ESTATSVARN(SndLimTransSnd,UNSIGNED32, + snd_lim_trans[TCP_ESTATS_SNDLIM_SENDER], perf_table), + ESTATSVARN(SndLimTransTSODefer,UNSIGNED32, + snd_lim_trans[TCP_ESTATS_SNDLIM_TSODEFER], perf_table), + ESTATSVARN(SndLimTimeRwin,UNSIGNED32, + snd_lim_time[TCP_ESTATS_SNDLIM_RWIN], perf_table), + ESTATSVARN(SndLimTimeCwnd,UNSIGNED32, + snd_lim_time[TCP_ESTATS_SNDLIM_CWND], perf_table), + ESTATSVARN(SndLimTimeSnd,UNSIGNED32, + snd_lim_time[TCP_ESTATS_SNDLIM_SENDER], perf_table), + ESTATSVARN(SndLimTimeTSODefer,UNSIGNED32, + snd_lim_time[TCP_ESTATS_SNDLIM_TSODEFER], perf_table), +}; + +struct tcp_estats_var path_var_array[] = { + READFUNC(RetranThresh,UNSIGNED32), + ESTATSVAR(NonRecovDAEpisodes,UNSIGNED32, path_table), + ESTATSVAR(SumOctetsReordered,UNSIGNED32, path_table), + ESTATSVAR(NonRecovDA,UNSIGNED32, path_table), + ESTATSVAR(SampleRTT,UNSIGNED32, path_table), + READFUNC(RTTVar,UNSIGNED32), + ESTATSVAR(MaxRTT,UNSIGNED32, path_table), + ESTATSVAR(MinRTT,UNSIGNED32, path_table), + HCINF32(SumRTT,UNSIGNED32, path_table), + ESTATSVARN(HCSumRTT,UNSIGNED64, SumRTT, path_table), + ESTATSVAR(CountRTT,UNSIGNED32, path_table), + ESTATSVAR(MaxRTO,UNSIGNED32, path_table), + ESTATSVAR(MinRTO,UNSIGNED32, path_table), + ESTATSVAR(IpTtl,UNSIGNED32, path_table), + ESTATSVAR(IpTosIn,UNSIGNED8, path_table), + READFUNC(IpTosOut,UNSIGNED8), + ESTATSVAR(PreCongSumCwnd,UNSIGNED32, path_table), + ESTATSVAR(PreCongSumRTT,UNSIGNED32, path_table), + 
ESTATSVAR(PostCongSumRTT,UNSIGNED32, path_table), + ESTATSVAR(PostCongCountRTT,UNSIGNED32, path_table), + ESTATSVAR(ECNsignals,UNSIGNED32, path_table), + ESTATSVAR(DupAckEpisodes,UNSIGNED32, path_table), + READFUNC(RcvRTT,UNSIGNED32), + ESTATSVAR(DupAcksOut,UNSIGNED32, path_table), + ESTATSVAR(CERcvd,UNSIGNED32, path_table), + ESTATSVAR(ECESent,UNSIGNED32, path_table), +}; + +struct tcp_estats_var stack_var_array[] = { + ESTATSVAR(ActiveOpen,SIGNED32, stack_table), + READFUNC(MSSSent,UNSIGNED32), + READFUNC(MSSRcvd,UNSIGNED32), + READFUNC(WinScaleSent,SIGNED32), + READFUNC(WinScaleRcvd,SIGNED32), + READFUNC(TimeStamps,SIGNED32), + READFUNC(ECN,SIGNED32), + READFUNC(WillSendSACK,SIGNED32), + READFUNC(WillUseSACK,SIGNED32), + READFUNC(State,SIGNED32), + READFUNC(Nagle,SIGNED32), + ESTATSVAR(MaxSsCwnd,UNSIGNED32, stack_table), + ESTATSVAR(MaxCaCwnd,UNSIGNED32, stack_table), + ESTATSVAR(MaxSsthresh,UNSIGNED32, stack_table), + ESTATSVAR(MinSsthresh,UNSIGNED32, stack_table), + READFUNC(InRecovery,SIGNED32), + ESTATSVAR(DupAcksIn,UNSIGNED32, stack_table), + ESTATSVAR(SpuriousFrDetected,UNSIGNED32, stack_table), + ESTATSVAR(SpuriousRtoDetected,UNSIGNED32, stack_table), + ESTATSVAR(SoftErrors,UNSIGNED32, stack_table), + ESTATSVAR(SoftErrorReason,SIGNED32, stack_table), + ESTATSVAR(SlowStart,UNSIGNED32, stack_table), + ESTATSVAR(CongAvoid,UNSIGNED32, stack_table), + ESTATSVAR(OtherReductions,UNSIGNED32, stack_table), + ESTATSVAR(CongOverCount,UNSIGNED32, stack_table), + ESTATSVAR(FastRetran,UNSIGNED32, stack_table), + ESTATSVAR(SubsequentTimeouts,UNSIGNED32, stack_table), + READFUNC(CurTimeoutCount,UNSIGNED32), + ESTATSVAR(AbruptTimeouts,UNSIGNED32, stack_table), + ESTATSVAR(SACKsRcvd,UNSIGNED32, stack_table), + ESTATSVAR(SACKBlocksRcvd,UNSIGNED32, stack_table), + ESTATSVAR(SendStall,UNSIGNED32, stack_table), + ESTATSVAR(DSACKDups,UNSIGNED32, stack_table), + ESTATSVAR(MaxMSS,UNSIGNED32, stack_table), + ESTATSVAR(MinMSS,UNSIGNED32, stack_table), + ESTATSVAR(SndInitial,UNSIGNED32, stack_table), + ESTATSVAR(RecInitial,UNSIGNED32, stack_table), + ESTATSVAR(CurRetxQueue,UNSIGNED32, stack_table), + ESTATSVAR(MaxRetxQueue,UNSIGNED32, stack_table), + READFUNC(CurReasmQueue,UNSIGNED32), + ESTATSVAR(MaxReasmQueue,UNSIGNED32, stack_table), + ESTATSVAR(EarlyRetrans,UNSIGNED32, stack_table), + ESTATSVAR(EarlyRetransDelay,UNSIGNED32, stack_table), +}; + +struct tcp_estats_var app_var_array[] = { + TPVAR32(SndUna,UNSIGNED32, snd_una), + TPVAR32(SndNxt,UNSIGNED32, snd_nxt), + ESTATSVAR(SndMax,UNSIGNED32, app_table), + HCINF32(ThruOctetsAcked,UNSIGNED32, app_table), + ESTATSVARN(HCThruOctetsAcked,UNSIGNED64, ThruOctetsAcked, app_table), + TPVAR32(RcvNxt,UNSIGNED32, rcv_nxt), + HCINF32(ThruOctetsReceived,UNSIGNED32, app_table), + ESTATSVARN(HCThruOctetsReceived,UNSIGNED64, ThruOctetsReceived, + app_table), + READFUNC(CurAppWQueue,UNSIGNED32), + ESTATSVAR(MaxAppWQueue,UNSIGNED32, app_table), + READFUNC(CurAppRQueue,UNSIGNED32), + ESTATSVAR(MaxAppRQueue,UNSIGNED32, app_table), +}; + +struct tcp_estats_var tune_var_array[] = { + RWFUNC(LimCwnd,UNSIGNED32), + RWFUNC(LimSsthresh,UNSIGNED32), + RWFUNC(LimRwin,UNSIGNED32), + READFUNC(LimMSS,UNSIGNED32), +}; + +struct tcp_estats_var extras_var_array[] = { + ESTATSVAR(OtherReductionsCV, UNSIGNED32, extras_table), + ESTATSVAR(OtherReductionsCM, UNSIGNED32, extras_table), +}; + +struct tcp_estats_var *estats_var_array[] = { + perf_var_array, + path_var_array, + stack_var_array, + app_var_array, + tune_var_array, + extras_var_array +}; +EXPORT_SYMBOL(estats_var_array); + +void 
tcp_estats_find_var_by_iname(struct tcp_estats_var **var, const char *name) +{ + int i, j; + + *var = NULL; + for (i = 0; i < MAX_TABLE; i++) { + for (j = 0; j < max_index[i]; j++) { + if (strnicmp(estats_var_array[i][j].name, + name, 21) == 0) { + *var = &estats_var_array[i][j]; + return; + } + } + } +} +EXPORT_SYMBOL(tcp_estats_find_var_by_iname); + +void tcp_estats_read_connection_spec(struct tcp_estats_connection_spec *spec, + struct tcp_estats *stats) +{ + struct tcp_estats_connection_table *connection_table = + stats->tables.connection_table; + memcpy(&spec->rem_addr[0], connection_table->RemAddress.data, 16); + memcpy(&spec->local_addr[0], connection_table->LocalAddress.data, 16); + spec->addr_type = connection_table->AddressType; + spec->rem_port = connection_table->RemPort; + spec->local_port = connection_table->LocalPort; +} +EXPORT_SYMBOL(tcp_estats_read_connection_spec); + +#else +#endif /* CONFIG_TCP_ESTATS */ diff --git a/net/ipv4/tcp_estats_nl.c b/net/ipv4/tcp_estats_nl.c new file mode 100644 index 0000000..febbfc1 --- /dev/null +++ b/net/ipv4/tcp_estats_nl.c @@ -0,0 +1,531 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_TCP_ESTATS + +static struct genl_family genl_estats_family = { + .id = GENL_ID_GENERATE, + .name = "tcp_estats", + .hdrsize = 0, + .version = 1, + .maxattr = NLE_ATTR_MAX, +}; + +static struct genl_multicast_group genl_estats_mc = { + .name = "tcp_estats_mc", +}; + +static const struct nla_policy spec_policy[NEA_4TUPLE_MAX+1] = { + [NEA_REM_ADDR] = { .type = NLA_BINARY, + .len = 16 }, + [NEA_LOCAL_ADDR] = { .type = NLA_BINARY, + .len = 16 }, + [NEA_ADDR_TYPE] = { .type = NLA_U8 }, + [NEA_REM_PORT] = { .type = NLA_U16 }, + [NEA_LOCAL_PORT] = { .type = NLA_U16 }, + [NEA_CID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy mask_policy[NEA_MASK_MAX+1] = { + [NEA_PERF_MASK] = { .type = NLA_U64 }, + [NEA_PATH_MASK] = { .type = NLA_U64 }, + [NEA_STACK_MASK] = { .type = NLA_U64 }, + [NEA_APP_MASK] = { .type = NLA_U64 }, + [NEA_TUNE_MASK] = { .type = NLA_U64 }, + [NEA_EXTRAS_MASK] = { .type = NLA_U64 }, +}; + +static const struct nla_policy write_policy[NEA_WRITE_MAX+1] = { + [NEA_WRITE_VAR] = { .type = NLA_STRING }, + [NEA_WRITE_VAL] = { .type = NLA_U32 }, +}; + +static int +genl_list_conns(struct sk_buff *skb, struct genl_info *info) +{ + + struct sk_buff *msg = NULL; + void *hdr = NULL; + struct nlattr *nest; + struct tcp_estats *stats; + struct tcp_estats_connection_spec spec; + + int tmpid = 0; + + while (1) { + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + hdr = genlmsg_put(msg, 0, 0, &genl_estats_family, 0, TCPE_CMD_LIST_CONNS); + if (hdr == NULL) + goto nlmsg_failure; + + spin_lock(&tcp_estats_idr_lock); + stats = idr_get_next(&tcp_estats_idr, &tmpid); + spin_unlock(&tcp_estats_idr_lock); + + if (stats == NULL) + break; + + tcp_estats_read_connection_spec(&spec, stats); + + nest = nla_nest_start(msg, NLE_ATTR_4TUPLE | NLA_F_NESTED); + + nla_put(msg, NEA_REM_ADDR, 16, &spec.rem_addr[0]); + nla_put_u16(msg, NEA_REM_PORT, spec.rem_port); + nla_put(msg, NEA_LOCAL_ADDR, 16, &spec.local_addr[0]); + nla_put_u16(msg, NEA_LOCAL_PORT, spec.local_port); + nla_put_u8(msg, NEA_ADDR_TYPE, spec.addr_type); + nla_put_u32(msg, NEA_CID, tmpid); + + nla_nest_end(msg, nest); + + genlmsg_end(msg, hdr); + genlmsg_unicast(sock_net(skb->sk), msg, info->snd_portid); + + tmpid = tmpid + 1; + } + + return 0; + +nlmsg_failure: + printk(KERN_DEBUG 
"nlmsg_failure\n"); + + return -ENOBUFS; +} + +static int +genl_read_vars(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg = NULL; + void *hdr = NULL; + struct nlattr *tb[NEA_4TUPLE_MAX+1]; + struct nlattr *tb_mask[NEA_MASK_MAX+1] = {}; + struct nlattr *nest[MAX_TABLE]; + struct nlattr *nest_time; + struct nlattr *nest_spec; + struct tcp_estats_connection_spec spec; + + struct tcp_estats *stats; + int cid; + int ret; + int i, j, k; + int tblnum; + uint64_t mask; + uint64_t masks[MAX_TABLE] = { DEFAULT_PERF_MASK, DEFAULT_PATH_MASK, + DEFAULT_STACK_MASK, DEFAULT_APP_MASK, DEFAULT_TUNE_MASK, + DEFAULT_EXTRAS_MASK }; + + int if_mask[] = { [0 ... MAX_TABLE-1] = 0 }; + + union estats_val *val = NULL; + int numvars = TOTAL_NUM_VARS; + size_t valarray_size = numvars*sizeof(union estats_val); + + struct timeval read_time; + + struct sock *sk; + const struct cred *cred = get_current_cred(); + + if (!info->attrs[NLE_ATTR_4TUPLE]) + return -EINVAL; + + ret = nla_parse_nested(tb, NEA_4TUPLE_MAX, info->attrs[NLE_ATTR_4TUPLE], + spec_policy); + + if (ret < 0) + goto nla_parse_failure; + + if(!tb[NEA_CID]) + goto nla_parse_failure; + + cid = nla_get_u32(tb[NEA_CID]); + + if (cid < 1) + goto nla_parse_failure; + + ret = nla_parse_nested(tb_mask, NEA_MASK_MAX, + info->attrs[NLE_ATTR_MASK], mask_policy); + + if (ret < 0) + goto nla_parse_failure; + + if (tb_mask[NEA_PERF_MASK]) { + masks[PERF_TABLE] = nla_get_u64(tb_mask[NEA_PERF_MASK]); + if_mask[PERF_TABLE] = 1; + } + if (tb_mask[NEA_PATH_MASK]) { + masks[PATH_TABLE] = nla_get_u64(tb_mask[NEA_PATH_MASK]); + if_mask[PATH_TABLE] = 1; + } + if (tb_mask[NEA_STACK_MASK]) { + masks[STACK_TABLE] = nla_get_u64(tb_mask[NEA_STACK_MASK]); + if_mask[STACK_TABLE] = 1; + } + if (tb_mask[NEA_APP_MASK]) { + masks[APP_TABLE] = nla_get_u64(tb_mask[NEA_APP_MASK]); + if_mask[APP_TABLE] = 1; + } + if (tb_mask[NEA_TUNE_MASK]) { + masks[TUNE_TABLE] = nla_get_u64(tb_mask[NEA_TUNE_MASK]); + if_mask[TUNE_TABLE] = 1; + } + if (tb_mask[NEA_EXTRAS_MASK]) { + masks[EXTRAS_TABLE] = nla_get_u64(tb_mask[NEA_EXTRAS_MASK]); + if_mask[EXTRAS_TABLE] = 1; + } + + rcu_read_lock(); + stats = idr_find(&tcp_estats_idr, cid); + rcu_read_unlock(); + if (stats == NULL) + return -EINVAL; + + tcp_estats_use(stats); + + sk = stats->sk; + + if (!stats->ids) { + read_lock_bh(&sk->sk_callback_lock); + stats->uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : + GLOBAL_ROOT_UID; + stats->gid = sk->sk_socket ? 
SOCK_INODE(sk->sk_socket)->i_gid : + GLOBAL_ROOT_GID; + read_unlock_bh(&sk->sk_callback_lock); + + stats->ids = 1; + } + + if (!(capable(CAP_SYS_ADMIN) || + (stats->uid == cred->uid) || + (stats->gid == cred->gid))) { + tcp_estats_unuse(stats); + return -EACCES; + } + + val = kmalloc(valarray_size, GFP_KERNEL); + if (!val) + return -ENOMEM; + + do_gettimeofday(&read_time); + + lock_sock(stats->sk); + + for (tblnum = 0; tblnum < MAX_TABLE; tblnum++) { + if (if_mask[tblnum]) { + i = 0; + mask = masks[tblnum]; + while ((i < max_index[tblnum]) && mask) { + j = __builtin_ctzl(mask); + mask = mask >> j; + i += j; + + k = single_index(tblnum, i); + read_tcp_estats(&(val[k]), stats, + &(estats_var_array[tblnum][i])); + + mask = mask >> 1; + i++; + } + } else { + for (i = 0; i < max_index[tblnum]; i++) { + k = single_index(tblnum, i); + read_tcp_estats(&(val[k]), stats, + &(estats_var_array[tblnum][i])); + + } + } + } + + release_sock(stats->sk); + + tcp_estats_unuse(stats); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (msg == NULL) + goto nlmsg_failure; + + hdr = genlmsg_put(msg, 0, 0, &genl_estats_family, 0, + TCPE_CMD_READ_VARS); + if (hdr == NULL) + goto nlmsg_failure; + + nest_time = nla_nest_start(msg, NLE_ATTR_TIME | NLA_F_NESTED); + if (nla_put_u32(msg, NEA_TIME_SEC, + lower_32_bits(read_time.tv_sec))) + goto nla_put_failure; + if (nla_put_u32(msg, NEA_TIME_USEC, + lower_32_bits(read_time.tv_usec))) + goto nla_put_failure; + nla_nest_end(msg, nest_time); + + tcp_estats_read_connection_spec(&spec, stats); + + nest_spec = nla_nest_start(msg, NLE_ATTR_4TUPLE | NLA_F_NESTED); + + nla_put(msg, NEA_REM_ADDR, 16, &spec.rem_addr[0]); + nla_put_u16(msg, NEA_REM_PORT, spec.rem_port); + nla_put(msg, NEA_LOCAL_ADDR, 16, &spec.local_addr[0]); + nla_put_u16(msg, NEA_LOCAL_PORT, spec.local_port); + nla_put_u8(msg, NEA_ADDR_TYPE, spec.addr_type); + nla_put_u32(msg, NEA_CID, cid); + + nla_nest_end(msg, nest_spec); + + for (tblnum = 0; tblnum < MAX_TABLE; tblnum++) { + switch (tblnum) { + case PERF_TABLE: + nest[tblnum] = nla_nest_start(msg, NLE_ATTR_PERF | NLA_F_NESTED); + break; + case PATH_TABLE: + nest[tblnum] = nla_nest_start(msg, NLE_ATTR_PATH | NLA_F_NESTED); + break; + case STACK_TABLE: + nest[tblnum] = nla_nest_start(msg, NLE_ATTR_STACK | NLA_F_NESTED); + break; + case APP_TABLE: + nest[tblnum] = nla_nest_start(msg, NLE_ATTR_APP | NLA_F_NESTED); + break; + case TUNE_TABLE: + nest[tblnum] = nla_nest_start(msg, NLE_ATTR_TUNE | NLA_F_NESTED); + break; + case EXTRAS_TABLE: + nest[tblnum] = nla_nest_start(msg, NLE_ATTR_EXTRAS | NLA_F_NESTED); + break; + } + if (!nest[tblnum]) + goto nla_put_failure; + + i = 0; + mask = masks[tblnum]; + while ((i < max_index[tblnum]) && mask) { + j = __builtin_ctzl(mask); + mask = mask >> j; + i += j; + + k = single_index(tblnum, i); + + switch (estats_var_array[tblnum][i].type) { + + case TCP_ESTATS_UNSIGNED64: + if (nla_put_u64(msg, i, val[k].o)) + goto nla_put_failure; + break; + case TCP_ESTATS_UNSIGNED32: + if (nla_put_u32(msg, i, val[k].t)) + goto nla_put_failure; + break; + case TCP_ESTATS_SIGNED32: + if (nla_put_u32(msg, i, val[k].s)) + goto nla_put_failure; + break; + case TCP_ESTATS_UNSIGNED16: + if (nla_put_u16(msg, i, val[k].w)) + goto nla_put_failure; + break; + case TCP_ESTATS_UNSIGNED8: + if (nla_put_u8(msg, i, val[k].b)) + goto nla_put_failure; + break; + default: + break; + } + + mask = mask >> 1; + i++; + } + nla_nest_end(msg, nest[tblnum]); + } + genlmsg_end(msg, hdr); + + genlmsg_unicast(sock_net(skb->sk), msg, info->snd_portid); + + 
kfree(val); + + return 0; + +nlmsg_failure: + printk(KERN_DEBUG "nlmsg_failure\n"); + +nla_put_failure: + printk(KERN_DEBUG "nla_put_failure\n"); + genlmsg_cancel(msg, hdr); + kfree_skb(msg); + kfree(val); + + return -ENOBUFS; + +nla_parse_failure: + printk(KERN_DEBUG "nla_parse_failure\n"); + + return -EINVAL; +} + +static int +genl_write_var(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb_tuple[NEA_4TUPLE_MAX+1]; + struct nlattr *tb_write[NEA_WRITE_MAX+1]; + int ret; + int cid = 0; + char name[21]; + struct tcp_estats *stats; + struct tcp_estats_var *var = NULL; + uint32_t val; + + struct sock *sk; + const struct cred *cred = get_current_cred(); + + if (!info->attrs[NLE_ATTR_4TUPLE]) + return -EINVAL; + + ret = nla_parse_nested(tb_tuple, NEA_4TUPLE_MAX, + info->attrs[NLE_ATTR_4TUPLE], spec_policy); + + if (ret < 0) + goto nla_parse_failure; + + if(!tb_tuple[NEA_CID]) + goto nla_parse_failure; + + cid = nla_get_u32(tb_tuple[NEA_CID]); + + if (cid < 1) + goto nla_parse_failure; + + if (!info->attrs[NLE_ATTR_WRITE]) + return -EINVAL; + + ret = nla_parse_nested(tb_write, NEA_WRITE_MAX, + info->attrs[NLE_ATTR_WRITE], write_policy); + + if (ret < 0) + goto nla_parse_failure; + + if(!tb_write[NEA_WRITE_VAR]) + goto nla_parse_failure; + + nla_strlcpy(name, tb_write[NEA_WRITE_VAR], 21); + + tcp_estats_find_var_by_iname(&var, name); + + if (var == NULL) return -EINVAL; + + if (!tb_write[NEA_WRITE_VAL]) + goto nla_parse_failure; + + val = nla_get_u32(tb_write[NEA_WRITE_VAL]); + + rcu_read_lock(); + stats = idr_find(&tcp_estats_idr, cid); + rcu_read_unlock(); + if (stats == NULL) + return -EINVAL; + + tcp_estats_use(stats); + + sk = stats->sk; + + if (!stats->ids) { + read_lock_bh(&sk->sk_callback_lock); + stats->uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : + GLOBAL_ROOT_UID; + stats->gid = sk->sk_socket ? 
SOCK_INODE(sk->sk_socket)->i_gid : + GLOBAL_ROOT_GID; + read_unlock_bh(&sk->sk_callback_lock); + + stats->ids = 1; + } + + if (!(capable(CAP_SYS_ADMIN) || (stats->uid == cred->uid))) { + tcp_estats_unuse(stats); + return -EACCES; + } + + lock_sock(stats->sk); + ret = write_tcp_estats(&val, stats, var); + release_sock(stats->sk); + + tcp_estats_unuse(stats); + + if (ret == -1) + return -EPERM; + + return 0; + +nla_parse_failure: + printk(KERN_DEBUG "nla_parse_failure\n"); + + return -EINVAL; +} + +static struct genl_ops genl_estats_ops[] = { + { + .cmd = TCPE_CMD_READ_VARS, + .doit = genl_read_vars, + }, + { + .cmd = TCPE_CMD_WRITE_VAR, + .doit = genl_write_var, + }, + { + .cmd = TCPE_CMD_LIST_CONNS, + .doit = genl_list_conns, + }, +}; + +static int __init tcp_estats_nl_init(void) +{ + int ret = -EINVAL; + int i; + + ret = genl_register_family(&genl_estats_family); + if (ret < 0) + goto err; + + for (i = 0; i < ARRAY_SIZE(genl_estats_ops); i++) { + ret = genl_register_ops(&genl_estats_family, + &genl_estats_ops[i]); + if (ret < 0) + goto err_unregister; + } + + ret = genl_register_mc_group(&genl_estats_family, &genl_estats_mc); + if (ret < 0) + goto err_unregister; + + printk(KERN_INFO "tcp_estats netlink module initialized.\n"); + + return ret; + +err_unregister: + genl_unregister_family(&genl_estats_family); +err: + return ret; +} + +void __exit tcp_estats_nl_exit(void) +{ + genl_unregister_family(&genl_estats_family); + + printk(KERN_INFO "tcp_estats netlink module exiting.\n"); +} + +module_init(tcp_estats_nl_init); +module_exit(tcp_estats_nl_exit); + +MODULE_LICENSE("GPL"); + +#else +#endif /* CONFIG_TCP_ESTATS */ diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index c1a8175..2370726 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -250,6 +250,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) tp->snd_cwnd_cnt += ca->pkts_acked; ca->pkts_acked = 1; + TCP_ESTATS_VAR_INC(tp, stack_table, CongAvoid); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 59163c8..93eb347 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -235,10 +235,13 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: + TCP_ESTATS_VAR_INC(tp, path_table, CERcvd); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode((struct sock *)tp); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + } else { + TCP_ESTATS_VAR_INC(tp, path_table, ECESent); } /* fallinto */ default: @@ -1574,6 +1577,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, state.flag = 0; state.reord = tp->packets_out; + TCP_ESTATS_VAR_INC(tp, stack_table, SACKsRcvd); + TCP_ESTATS_VAR_ADD(tp, stack_table, SACKBlocksRcvd, num_sacks); + if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) tp->fackets_out = 0; @@ -2044,6 +2050,8 @@ void tcp_enter_loss(struct sock *sk, int how) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp)); + /* Reduce ssthresh if it has not yet been made inside this window. 
*/ if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { @@ -2320,8 +2328,12 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) */ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && - !tcp_may_send_now(sk)) - return !tcp_pause_early_retransmit(sk, flag); + !tcp_may_send_now(sk)) { + int early_retrans = !tcp_pause_early_retransmit(sk, flag); + if (early_retrans) + TCP_ESTATS_VAR_INC(tp, stack_table, EarlyRetrans); + return early_retrans; + } return false; } @@ -2458,9 +2470,15 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) */ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) { - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + tcp_max_burst(tp)); - tp->snd_cwnd_stamp = tcp_time_stamp; + u32 pkts = tcp_packets_in_flight(tp) + tcp_max_burst(tp); + + if (pkts < tp->snd_cwnd) { + tp->snd_cwnd = pkts; + tp->snd_cwnd_stamp = tcp_time_stamp; + + TCP_ESTATS_VAR_INC(tp, stack_table, OtherReductions); + TCP_ESTATS_VAR_INC(tp, extras_table, OtherReductionsCM); + } } /* Nothing was retransmitted or returned timestamp is less @@ -2520,6 +2538,7 @@ static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh) if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; TCP_ECN_withdraw_cwr(tp); + TCP_ESTATS_VAR_INC(tp, stack_table, CongOverCount); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); @@ -2545,11 +2564,14 @@ static bool tcp_try_undo_recovery(struct sock *sk) */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, true); - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) { mib_idx = LINUX_MIB_TCPLOSSUNDO; - else + TCP_ESTATS_VAR_INC(tp, stack_table, + SpuriousRtoDetected); + } else { mib_idx = LINUX_MIB_TCPFULLUNDO; - + TCP_ESTATS_VAR_INC(tp, stack_table, SpuriousFrDetected); + } NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->undo_marker = 0; } @@ -2734,6 +2756,7 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) tcp_init_cwnd_reduction(sk, set_ssthresh); tcp_set_ca_state(sk, TCP_CA_CWR); } + TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp)); } static void tcp_try_keep_open(struct sock *sk) @@ -2759,8 +2782,10 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) if (!tp->frto_counter && !tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; - if (flag & FLAG_ECE) + if (flag & FLAG_ECE) { tcp_enter_cwr(sk, 1); + TCP_ESTATS_VAR_INC(tp, path_table, ECNsignals); + } if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); @@ -2934,6 +2959,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, } break; + case TCP_CA_Disorder: + TCP_ESTATS_VAR_INC(tp, path_table, NonRecovDAEpisodes); + break; + case TCP_CA_Recovery: if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); @@ -2981,6 +3010,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); + + if (icsk->icsk_ca_state == TCP_CA_Disorder) + TCP_ESTATS_VAR_INC(tp, path_table, NonRecovDA); + if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag, newly_acked_sacked); return; @@ -3000,6 +3033,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, /* Otherwise enter Recovery state */ tcp_enter_recovery(sk, 
(flag & FLAG_ECE)); fast_rexmit = 1; + TCP_ESTATS_UPDATE(tp, tcp_estats_update_congestion(tp)); + TCP_ESTATS_VAR_INC(tp, stack_table, FastRetran); } if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk))) @@ -3012,6 +3047,7 @@ void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt) { tcp_rtt_estimator(sk, seq_rtt); tcp_set_rto(sk); + TCP_ESTATS_UPDATE(tcp_sk(sk), tcp_estats_update_rtt(sk, seq_rtt)); inet_csk(sk)->icsk_backoff = 0; } EXPORT_SYMBOL(tcp_valid_rtt_meas); @@ -3123,6 +3159,7 @@ void tcp_resume_early_retransmit(struct sock *sk) if (!tp->do_early_retrans) return; + TCP_ESTATS_VAR_INC(tp, stack_table, EarlyRetransDelay); tcp_enter_recovery(sk, false); tcp_update_scoreboard(sk, 1); tcp_xmit_retransmit_queue(sk); @@ -3397,9 +3434,11 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->max_window = nwin; tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } + TCP_ESTATS_UPDATE(tp, tcp_estats_update_rwin_rcvd(tp)); } } + TCP_ESTATS_UPDATE(tp, tcp_estats_update_acked(tp, ack)); tp->snd_una = ack; return flag; @@ -3545,6 +3584,7 @@ static bool tcp_process_frto(struct sock *sk, int flag) tp->frto_counter = 0; tp->undo_marker = 0; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); + TCP_ESTATS_VAR_INC(tp, stack_table, SpuriousRtoDetected); } return false; } @@ -3604,11 +3644,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int pkts_acked = 0; int previous_packets_out = 0; bool frto_cwnd = false; + int prior_state = icsk->icsk_ca_state; /* If the ack is older than previous acks * then we can probably ignore it. */ if (before(ack, prior_snd_una)) { + TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors); + TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason, + TCP_ESTATS_SOFTERROR_BELOW_ACK_WINDOW); + /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - tp->max_window)) { tcp_send_challenge_ack(sk); @@ -3620,14 +3665,22 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* If the ack includes data we haven't sent yet, discard * this segment (RFC793 Section 3.9). */ - if (after(ack, tp->snd_nxt)) + if (after(ack, tp->snd_nxt)) { + TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors); + TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason, + TCP_ESTATS_SOFTERROR_ABOVE_ACK_WINDOW); goto invalid_ack; + } if (tp->early_retrans_delayed) tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) + if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; + if (icsk->icsk_ca_state == TCP_CA_Disorder) + TCP_ESTATS_VAR_ADD(tp, path_table, SumOctetsReordered, + ack - prior_snd_una); + } prior_fackets = tp->fackets_out; prior_in_flight = tcp_packets_in_flight(tp); @@ -3644,6 +3697,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * Note, we use the fact that SND.UNA>=SND.WL2. 
*/ tcp_update_wl(tp, ack_seq); + TCP_ESTATS_UPDATE(tp, tcp_estats_update_acked(tp, ack)); tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; @@ -3696,6 +3750,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, prior_packets, is_dupack, flag); + + if (icsk->icsk_ca_state == TCP_CA_Open && + prior_state >= TCP_CA_CWR) + TCP_ESTATS_UPDATE(tp, tcp_estats_update_post_congestion(tp)); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) tcp_cong_avoid(sk, ack, prior_in_flight); @@ -4337,6 +4395,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); + TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq)); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tcp_hdr(skb)->fin) tcp_fin(sk); @@ -4428,6 +4487,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk)); + TCP_ESTATS_VAR_INC(tp, path_table, DupAcksOut); + skb1 = skb_peek_tail(&tp->out_of_order_queue); if (!skb1) { /* Initial out of order segment, build 1 SACK. */ @@ -4438,6 +4500,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->end_seq; } __skb_queue_head(&tp->out_of_order_queue, skb); + TCP_ESTATS_VAR_INC(tp, path_table, DupAckEpisodes); goto end; } @@ -4637,6 +4700,9 @@ queue_and_out: eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } + TCP_ESTATS_UPDATE( + tp, + tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq)); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) tcp_event_data_recv(sk, skb); @@ -4658,6 +4724,8 @@ queue_and_out: tcp_fast_path_check(sk); + TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk)); + if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) @@ -4967,6 +5035,8 @@ void tcp_cwnd_application_limited(struct sock *sk) if (win_used < tp->snd_cwnd) { tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + TCP_ESTATS_VAR_INC(tp, stack_table, OtherReductions); + TCP_ESTATS_VAR_INC(tp, extras_table, OtherReductionsCV); } tp->snd_cwnd_used = 0; } @@ -5282,6 +5352,9 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors); + TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason, + TCP_ESTATS_SOFTERROR_BELOW_TS_WINDOW); tcp_send_dupack(sk, skb); goto discard; } @@ -5301,6 +5374,13 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, goto syn_challenge; tcp_send_dupack(sk, skb); } + + TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors); + TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason, + before(TCP_SKB_CB(skb)->end_seq, tp->rcv_wup) ? 
+ TCP_ESTATS_SOFTERROR_BELOW_DATA_WINDOW : + TCP_ESTATS_SOFTERROR_ABOVE_DATA_WINDOW); + goto discard; } @@ -5444,6 +5524,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, return 0; } else { /* Header too small */ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors); + TCP_ESTATS_VAR_SET(tp, stack_table, + SoftErrorReason, + TCP_ESTATS_SOFTERROR_OTHER); goto discard; } } else { @@ -5482,6 +5566,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); + TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq)); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); } @@ -5509,10 +5594,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ + TCP_ESTATS_UPDATE(tp, tcp_estats_update_rcvd(tp, TCP_SKB_CB(skb)->end_seq)); eaten = tcp_queue_rcv(sk, skb, tcp_header_len, &fragstolen); } + TCP_ESTATS_UPDATE(tp, tcp_estats_update_recvq(sk)); tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { @@ -5570,6 +5657,9 @@ step5: csum_error: TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_ESTATS_VAR_INC(tp, stack_table, SoftErrors); + TCP_ESTATS_VAR_SET(tp, stack_table, SoftErrorReason, + TCP_ESTATS_SOFTERROR_DATA_CHECKSUM); discard: __kfree_skb(skb); @@ -5793,6 +5883,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, smp_mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + tcp_estats_establish(sk); + tcp_finish_connect(sk, skb); if ((tp->syn_fastopen || tp->syn_data) && @@ -6019,6 +6112,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); sk->sk_state_change(sk); + tcp_estats_establish(sk); /* Note, that this wakeup is only for marginal * crossed SYN case. Passively open sockets diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d09203c..6327f91 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -212,7 +212,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; - +#ifdef CONFIG_TCP_ESTATS + tp->rx_opt.rec_mss = 0; +#endif + /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket * lock select source port, enter ourselves into the hash tables and @@ -239,7 +242,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, usin->sin_port); - inet->inet_id = tp->write_seq ^ jiffies; err = tcp_connect(sk); @@ -1519,6 +1521,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; +#ifdef CONFIG_TCP_ESTATS + tmp_opt.rec_mss = 0; +#endif tcp_parse_options(skb, &tmp_opt, &hash_location, 0, want_cookie ? 
NULL : &foc); @@ -1701,6 +1706,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (!newsk) goto exit_nonewsk; + tcp_estats_create(newsk, TCP_ESTATS_ADDRTYPE_IPV4, TCP_ESTATS_ACTIVE); + newsk->sk_gso_type = SKB_GSO_TCPV4; inet_sk_rx_dst_set(newsk, skb); @@ -2018,6 +2025,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + TCP_ESTATS_UPDATE(tcp_sk(sk), tcp_estats_update_segrecv(tcp_sk(sk), skb)); ret = 0; if (!sock_owned_by_user(sk)) { #ifdef CONFIG_NET_DMA @@ -2038,6 +2046,7 @@ process: NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } + TCP_ESTATS_UPDATE(tcp_sk(sk), tcp_estats_update_finish_segrecv(tcp_sk(sk))); bh_unlock_sock(sk); sock_put(sk); @@ -2159,6 +2168,8 @@ static int tcp_v4_init_sock(struct sock *sk) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; #endif + tcp_estats_create(sk, TCP_ESTATS_ADDRTYPE_IPV4, TCP_ESTATS_INACTIVE); + return 0; } @@ -2197,6 +2208,8 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); + tcp_estats_destroy(sk); + /* TCP Cookie Transactions */ if (tp->cookie_values != NULL) { kref_put(&tp->cookie_values->kref, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2f672e7..d116f73 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -510,6 +510,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; +#ifdef CONFIG_TCP_ESTATS + newtp->rx_opt.rec_mss = req->mss; +#endif TCP_ECN_openreq_child(newtp, req); newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 84559e9..2133cab 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -79,6 +79,7 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + TCP_ESTATS_UPDATE(tp, tcp_estats_update_snd_nxt(tp)); /* Don't override Nagle indefinitely with F-RTO */ if (tp->frto_counter == 2) @@ -281,6 +282,7 @@ static u16 tcp_select_window(struct sock *sk) } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; + TCP_ESTATS_UPDATE(tp, tcp_estats_update_rwin_sent(tp)); /* Make sure we do not exceed the maximum possible * scaled window. @@ -1002,6 +1004,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_md5sig_key *md5; struct tcphdr *th; int err; +#ifdef CONFIG_TCP_ESTATS + int len; + __u32 seq; + __u32 end_seq; + int tcp_flags; + int pcount; +#endif BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -1105,11 +1114,29 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); +#ifdef CONFIG_TCP_ESTATS + /* If the skb isn't cloned, we can't reference it after + * calling queue_xmit, so copy everything we need here. 
*/ + len = skb->len; + pcount = tcp_skb_pcount(skb); + seq = TCP_SKB_CB(skb)->seq; + end_seq = TCP_SKB_CB(skb)->end_seq; + tcp_flags = TCP_SKB_CB(skb)->tcp_flags; +#endif + err = icsk->icsk_af_ops->queue_xmit(skb, &inet->cork.fl); + + if (likely(!err)) { + TCP_ESTATS_UPDATE(tp, tcp_estats_update_segsend(sk, len, pcount, + seq, end_seq, + tcp_flags)); + } + if (likely(err <= 0)) return err; tcp_enter_cwr(sk, 1); + TCP_ESTATS_VAR_INC(tp, stack_table, SendStall); return net_xmit_eval(err); } @@ -1468,6 +1495,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) if (icsk->icsk_mtup.enabled) mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); tp->mss_cache = mss_now; + TCP_ESTATS_UPDATE(tp, tcp_estats_update_mss(tp)); return mss_now; } @@ -1673,11 +1701,13 @@ static unsigned int tcp_snd_test(const struct sock *sk, struct sk_buff *skb, tcp_init_tso_segs(sk, skb, cur_mss); if (!tcp_nagle_test(tp, skb, cur_mss, nonagle)) - return 0; + return -TCP_ESTATS_SNDLIM_SENDER; cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) - cwnd_quota = 0; + if (!cwnd_quota) + return -TCP_ESTATS_SNDLIM_CWND; + if (!tcp_snd_wnd_test(tp, skb, cur_mss)) + return -TCP_ESTATS_SNDLIM_RWIN; return cwnd_quota; } @@ -1691,7 +1721,7 @@ bool tcp_may_send_now(struct sock *sk) return skb && tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH)); + tp->nonagle : TCP_NAGLE_PUSH)) > 0; } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet @@ -1974,6 +2004,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, unsigned int tso_segs, sent_pkts; int cwnd_quota; int result; + int why = TCP_ESTATS_SNDLIM_SENDER; sent_pkts = 0; @@ -1998,20 +2029,27 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, goto repair; /* Skip network transmission */ cwnd_quota = tcp_cwnd_test(tp, skb); - if (!cwnd_quota) + if (!cwnd_quota) { + why = TCP_ESTATS_SNDLIM_CWND; break; + } - if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) + if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) { + why = TCP_ESTATS_SNDLIM_RWIN; break; + } if (tso_segs == 1) { if (unlikely(!tcp_nagle_test(tp, skb, mss_now, (tcp_skb_is_last(sk, skb) ? - nonagle : TCP_NAGLE_PUSH)))) + nonagle : TCP_NAGLE_PUSH)))) { break; + } } else { - if (!push_one && tcp_tso_should_defer(sk, skb)) + if (!push_one && tcp_tso_should_defer(sk, skb)) { + why = TCP_ESTATS_SNDLIM_TSODEFER; break; + } } /* TSQ : sk_wmem_alloc accounts skb truesize, @@ -2029,13 +2067,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sk->sk_gso_max_segs)); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) { break; + } TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) + if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) { break; + } repair: /* Advance the send_head. This one is sent out. @@ -2050,6 +2090,8 @@ repair: break; } + TCP_ESTATS_UPDATE(tp, tcp_estats_update_sndlim(tp, why)); + if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; @@ -3033,11 +3075,16 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. 
*/ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + + TCP_ESTATS_VAR_SET(tp, stack_table, SndInitial, tp->write_seq); + TCP_ESTATS_VAR_SET(tp, app_table, SndMax, tp->write_seq); + TCP_ESTATS_UPDATE(tp, tcp_estats_update_snd_nxt(tp)); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); + return 0; } EXPORT_SYMBOL(tcp_connect); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b78aac3..05ba3f9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -295,6 +295,7 @@ static void tcp_probe_timer(struct sock *sk) if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) return; } + TCP_ESTATS_UPDATE(tp, tcp_estats_update_timeout(sk)); if (icsk->icsk_probes_out > max_probes) { tcp_write_err(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0fce928..ec54614 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -287,6 +287,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, np->opt->opt_nflen); tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); +#ifdef CONFIG_TCP_ESTATS + tp->rx_opt.rec_mss = 0; +#endif inet->inet_dport = usin->sin6_port; @@ -988,6 +991,9 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; +#ifdef CONFIG_TCP_ESTATS + tmp_opt.rec_mss = 0; +#endif tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); if (tmp_opt.cookie_plus > 0 && @@ -1212,6 +1218,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (newsk == NULL) goto out_nonewsk; + tcp_estats_create(newsk, TCP_ESTATS_ADDRTYPE_IPV6, TCP_ESTATS_ACTIVE); + /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, tcp_create_openreq_child now does this for us, see the @@ -1548,6 +1556,7 @@ process: skb->dev = NULL; bh_lock_sock_nested(sk); + TCP_ESTATS_UPDATE(tcp_sk(sk), tcp_estats_update_segrecv(tcp_sk(sk), skb)); ret = 0; if (!sock_owned_by_user(sk)) { #ifdef CONFIG_NET_DMA @@ -1568,6 +1577,7 @@ process: NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } + TCP_ESTATS_UPDATE(tcp_sk(sk), tcp_estats_update_finish_segrecv(tcp_sk(sk))); bh_unlock_sock(sk); sock_put(sk); @@ -1753,6 +1763,7 @@ static int tcp_v6_init_sock(struct sock *sk) #ifdef CONFIG_TCP_MD5SIG tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif + tcp_estats_create(sk, TCP_ESTATS_ADDRTYPE_IPV6, TCP_ESTATS_INACTIVE); return 0; }
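
Note on the netlink interface added above: genl_read_vars() expects a request carrying a nested NLE_ATTR_4TUPLE (with at least NEA_CID) plus a nested NLE_ATTR_MASK whose per-table u64 bitmasks select which RFC 4898 variables to return. The following is a minimal userspace sketch, not part of the patch, assuming libnl-genl-3 and that the TCPE_CMD_*, NLE_ATTR_* and NEA_* enum values are taken from the patch's tcp_estats_nl.h header (not shown in this excerpt):

/* Illustrative only: request the perf-table variables of connection id
 * "cid" from the tcp_estats generic netlink family. */
#include <errno.h>
#include <stdint.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

int request_perf_vars(int cid, uint64_t perf_mask)
{
	struct nl_sock *sock = nl_socket_alloc();
	struct nl_msg *msg;
	struct nlattr *tuple, *mask;
	int family, err = -ENOMEM;

	if (!sock)
		return -ENOMEM;
	genl_connect(sock);
	family = genl_ctrl_resolve(sock, "tcp_estats");
	if (family < 0) {
		err = family;
		goto out;
	}

	msg = nlmsg_alloc();
	if (!msg)
		goto out;
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    TCPE_CMD_READ_VARS, 1);

	/* genl_read_vars() requires NLE_ATTR_4TUPLE with at least NEA_CID. */
	tuple = nla_nest_start(msg, NLE_ATTR_4TUPLE);
	nla_put_u32(msg, NEA_CID, cid);
	nla_nest_end(msg, tuple);

	/* Per-table bitmasks select which MIB variables the kernel reads. */
	mask = nla_nest_start(msg, NLE_ATTR_MASK);
	nla_put_u64(msg, NEA_PERF_MASK, perf_mask);
	nla_nest_end(msg, mask);

	err = nl_send_auto(sock, msg);
	nlmsg_free(msg);
	if (err >= 0)
		/* The unicast reply carries the values; install an
		 * NL_CB_VALID callback to parse the nested tables. */
		err = nl_recvmsgs_default(sock);
out:
	nl_socket_free(sock);
	return err;
}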