《TCP/IP详解 卷2》笔记:TCP的输入函数:tcp_input
TCP输入处理是系统中最长的一部分代码,tcp_input函数约有1100行代码(预警!)。它完全遵循RFC 793中定义的输入事件处理步骤,这些步骤详细定义了如何根据连接的当前状态处理不同的输入报文段。
当发现分组IP首部中的协议字段是TCP协议时,IP协议的软中断处理函数ipintr调用tcp_input函数进行处理。下面列出tcp_input函数的代码,其中我删去了处理URG标志的流程:
/*
* TCP input routine, follows pages 65-76 of the
* protocol specification dated September, 1981 very closely.
*/
void
tcp_input(m, iphlen)
register struct mbuf *m;
int iphlen;
{
register struct tcpiphdr *ti;
register struct inpcb *inp;
u_char *optp = NULL;
int optlen;
int len, tlen, off;
register struct tcpcb *tp = 0;
register int tiflags;
struct socket *so;
int todrop, acked, ourfinisacked, needoutput = 0;
short ostate;
struct in_addr laddr;
int dropsocket = 0;
int iss = 0;
u_long tiwin, ts_val, ts_ecr;
int ts_present = 0;
/*
* Get IP and TCP header together in first mbuf.
* Note: IP leaves IP header in first mbuf.
*/
ti = mtod(m, struct tcpiphdr *);
if (iphlen > sizeof (struct ip)) /*IP⾸部包含选项?*/
ip_stripoptions(m, (struct mbuf *)0); /*过滤掉IP⾸部选项*/
if (m->m_len < sizeof (struct tcpiphdr)) { /*IP⾸部和TCP⾸部不在⼀个mbuf?*/
if ((m = m_pullup(m, sizeof (struct tcpiphdr))) == 0) { /*将IP⾸部和TCP⾸部调整到⼀个mbuf中*/
return;
}
ti = mtod(m, struct tcpiphdr *);
}
/*
* Checksum extended TCP header and data.
*/
tlen = ((struct ip *)ti)->ip_len; /*经过IP层处理后,tlen是TCP报⽂段的长度*/
len = sizeof (struct ip) + tlen; /*IP数据报总长度*/
ti->ti_next = ti->ti_prev = 0;
ti->ti_x1 = 0;
ti->ti_len = (u_short)tlen;
HTONS(ti->ti_len);
if (ti->ti_sum = in_cksum(m, len)) { /*验证校验和失败,丢弃报⽂*/
goto drop;
}
/*
* Check that TCP offset makes sense,
* pull out TCP options and adjust length. XXX
*/
off = ti->ti_off << 2; /*TCP⾸部的长度*/
if (off < sizeof (struct tcphdr) || off > tlen) { /*⽆效的TCP⾸部,丢弃*/
goto drop;
}
tlen -= off; /*数据的长度*/
ti->ti_len = tlen;
if (off > sizeof (struct tcphdr)) { /*TCP⾸部包含选项?*/
if (m->m_len < sizeof(struct ip) + off) { /*TCP⾸部和选项不在⼀个mbuf?*/
if ((m = m_pullup(m, sizeof (struct ip) + off)) == 0) { /*将IP首部和TCP首部调整到一个mbuf中*/
return;
}
ti = mtod(m, struct tcpiphdr *);
}
optlen = off - sizeof (struct tcphdr); /*TCP选项的长度*/
optp = mtod(m, u_char *) + sizeof (struct tcpiphdr); /*指向选项*/
/*
* Do quick retrieval of timestamp options ("options
* prediction?"). If timestamp is the only option and it's
* formatted as recommended in RFC 1323 appendix A, we
* quickly get the values now and not bother calling
* tcp_dooptions(), etc.
*/
if ((optlen == TCPOLEN_TSTAMP_APPA ||
(optlen > TCPOLEN_TSTAMP_APPA &&
optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
*(u_long *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
(ti->ti_flags & TH_SYN) == 0) { /*选项中只包含时间戳选项?*/
ts_present = 1;
ts_val = ntohl(*(u_long *)(optp + 4)); /*记录对端的时间戳*/
ts_ecr = ntohl(*(u_long *)(optp + 8)); /*记录本端回显的时间戳*/
optp = NULL; /* we've parsed the options */
}
}
tiflags = ti->ti_flags; /*获取TCP⾸部中的标志*/
/*
* Convert TCP protocol specific fields to host format.
*/
NTOHL(ti->ti_seq);
NTOHL(ti->ti_ack);
NTOHS(ti->ti_win);
NTOHS(ti->ti_urp);
/*
* Locate pcb for segment.
*/
findpcb:
inp = tcp_last_inpcb;
if (inp->inp_lport != ti->ti_dport ||
inp->inp_fport != ti->ti_sport ||
inp->inp_faddr.s_addr != ti->ti_src.s_addr ||
inp->inp_laddr.s_addr != ti->ti_dst.s_addr) {
inp = in_pcblookup(&tcb, ti->ti_src, ti->ti_sport,
ti->ti_dst, ti->ti_dport, INPLOOKUP_WILDCARD); /*根据源IP地址、源端口号、目的IP地址和目的端口号查找合适的inpcb结构*/
if (inp)
tcp_last_inpcb = inp; /*记录最新查到的inpcb*/
++ps_pcbcachemiss;
}
/*
* If the state is CLOSED (i.e., TCB does not exist) then
* all data in the incoming segment is discarded.
* If the TCB exists but is in CLOSED state, it is embryonic,
* but should either do a listen or a connect soon.
*/
if (inp == 0) /*找不到inpcb,丢弃报文,并发送RST*/
goto dropwithreset;
tp = intotcpcb(inp);
if (tp == 0) /*没有tcpcb?丢弃报⽂,并发送RST*/
goto dropwithreset;
if (tp->t_state == TCPS_CLOSED) /*连接状态为CLOSED,丢弃报⽂*/
goto drop;
/* Unscale the window into a 32-bit value. */
if ((tiflags & TH_SYN) == 0) /*计算通告的窗⼝⼤⼩*/
tiwin = ti->ti_win << tp->snd_scale;
else
tiwin = ti->ti_win;
so = inp->inp_socket;
if (so->so_options & SO_ACCEPTCONN) { /*该socket是监听socket?
SO_ACCEPTCONN标志在执⾏listen系统调⽤时被置上!*/
if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) { /*收到的不是纯SYN报⽂?*/
/*
* Note: dropwithreset makes sure we don't
* send a reset in response to a RST.
*/
if (tiflags & TH_ACK) { /*带ACK标志的⽆效的SYN报⽂,丢弃,并发送RST*/
goto dropwithreset;
}
goto drop; /*丢弃报⽂*/
}
so = sonewconn(so, 0); /*创建⼀个新的socket!*/
if (so == 0)
goto drop;
/*
* This is ugly, but ....
*
* Mark socket as temporary until we're
* committed to keeping it. The code at
* ``drop'' and ``dropwithreset'' check the
* flag dropsocket to see if the temporary
* socket created here should be discarded.
* We mark the socket as discardable until
* we're committed to it below in TCPS_LISTEN.
*/
dropsocket++;
inp = (struct inpcb *)so->so_pcb;
inp->inp_laddr = ti->ti_dst; /*设置新socket的本地地址*/
inp->inp_lport = ti->ti_dport; /*设置新socket的本地端⼝*/
tp = intotcpcb(inp);
tp->t_state = TCPS_LISTEN; /*连接状态设置为TCPS_LISTEN!会执⾏329⾏的流程*/
/* Compute proper scaling value from buffer space
*/
while (tp->request_r_scale < TCP_MAX_WINSHIFT && /*根据接收缓冲区⼤⼩计算窗⼝缩放因⼦*/ TCP_MAXWIN << tp->request_r_scale < so->so_rcv.sb_hiwat)
tp->request_r_scale++;
}
/*
* Segment received on connection.
* Reset idle time and keep-alive timer.
*/
tp->t_idle = 0; /*连接空闲时间清零*/
tp->t_timer[TCPT_KEEP] = tcp_keepidle; /*保活定时器复位*/
/*
* Process options if not in LISTEN state,
* else do it below (after getting remote address).
*/
if (optp && tp->t_state != TCPS_LISTEN) /*解析不是LISTEN状态的TCP选项*/
tcp_dooptions(tp, optp, optlen, ti,
&ts_present, &ts_val, &ts_ecr);
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
* no control flags, is in-sequence, the window didn't
* change and we're not retransmitting, it's a
* candidate. If the length is zero and the ack moved
* forward, we're the sender side of the xfer. Just
* free the data acked & wake any higher level process
* that was blocked waiting for space. If the length
* is non-zero and the ack didn't move, we're the
* receiver side. If we're getting packets in-order
* (the reassembly queue is empty), add the data to
* the socket buffer and note that we need a delayed ack.
*/
/*⾸部预测算法,tcp_input函数的⼀条快速路径!*/
if (tp->t_state == TCPS_ESTABLISHED && /*连接处于ESTABLISHED状态*/
(tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && /*只有ACK标志*/ (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && /*没有时间戳或是新的时间戳*/
ti->ti_seq == tp->rcv_nxt && /*序列号是期望收到下⼀个数据的序列号*/
tiwin && tiwin == tp->snd_wnd && /*对端通告的窗⼝⼤⼩没有变化*/
tp->snd_nxt == tp->snd_max) { /*上⼀个报⽂段不是重传报⽂*/
if (ts_present && TSTMP_GEQ(ts_val, tp->ts_recent) &&
SEQ_LEQ(ti->ti_seq, tp->last_ack_sent)) { /*哪个时间戳需要回显,正确的算法*/
tp->ts_recent_age = tcp_now; /*最近⼀次ts_recent被更新的时间戳*/
tp->ts_recent = ts_val; /*对端发送的最新的有效时间戳*/
}
if (ti->ti_len == 0) { /*不包含任何数据*/
if (SEQ_GT(ti->ti_ack, tp->snd_una) &&
SEQ_LEQ(ti->ti_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd) { /*有效的纯ACK报⽂段*/
/*
* this is a pure ack for outstanding data.
*/
++ps_predack;
if (ts_present) /*如果有时间戳,根据时间戳就可计算报⽂的RTT,然后更新RTO*/
tcp_xmit_timer(tp, tcp_now-ts_ecr+1);
else if (tp->t_rtt && /*如果没有时间戳,根据t_rtt计算报⽂的RTT,然后更新RTO*/
SEQ_GT(ti->ti_ack, tp->t_rtseq))
tcp_xmit_timer(tp, tp->t_rtt);
acked = ti->ti_ack - tp->snd_una; /*ACK已确认的字节数*/
sbdrop(&so->so_snd, acked); /*丢弃发送缓冲区中已被确认的数据*/
tp->snd_una = ti->ti_ack; /*更新snd_una指针*/
m_freem(m); /*没有数据,可释放该mbuf*/
/*
* If all outstanding data are acked, stop
* retransmit timer, otherwise restart timer
* using current (possibly backed-off) value.
* If process is waiting for space,
* wakeup/selwakeup/signal. If data
* are ready to send, let tcp_output
* decide between more output or persist.
*/
if (tp->snd_una == tp->snd_max) /*没有数据需要被确认?重传定时器复位*/
tp->t_timer[TCPT_REXMT] = 0;
else if (tp->t_timer[TCPT_PERSIST] == 0) /*否则不是持续状态?启动重传定时器*/
tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
if (so->so_snd.sb_flags & SB_NOTIFY) /*唤醒在发送缓冲区等待的进程*/
sowwakeup(so);
if (so->so_snd.sb_cc) /*还有待发送的数据,调⽤tcp_output发送*/
(void) tcp_output(tp);
return;
}
} else if (ti->ti_ack == tp->snd_una && /*正常数据报⽂段,ACK标志置位,但是未确认任何数据*/ tp->seg_next == (struct tcpiphdr *)tp &&
ti->ti_len <= sbspace(&so->so_rcv)) {
/*
* this is a pure, in-sequence data packet
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
++ps_preddat;
tp->rcv_nxt += ti->ti_len; /*更新rcv_nxt*/
/*
* Drop TCP, IP headers and TCP options then add data
* to socket buffer.
*/
m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); /*丢弃IP⾸部,TCP⾸部*/
m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
sbappend(&so->so_rcv, m); /*将数据添加到接收缓冲区*/
sorwakeup(so); /*唤醒在接收缓冲区等待的进程*/
tp->t_flags |= TF_DELACK; /*置延迟的ACK标志*/
return;
}
} /*⾸部预测算法结束*/
/*
* Drop TCP, IP headers and TCP options.
*/
m->m_data += sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr); /*丢弃IP⾸部,TCP⾸部*/
m->m_len -= sizeof(struct tcpiphdr)+off-sizeof(struct tcphdr);
/*
* Calculate amount of space in receive window,
* and then do TCP input processing.
* Receive window is amount of space in rcv queue,
* but not less than advertised window.
*/
{ int win;
win = sbspace(&so->so_rcv); /*接收缓冲区可⽤空间*/
if (win < 0)
win = 0;
tp->rcv_wnd = max(win, (int)(tp->rcv_adv - tp->rcv_nxt)); /*更新接收窗⼝⼤⼩*/
}
switch (tp->t_state) {
/*
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论