IP包的生成和发送接口(1)
http://blog.sina.com.cn/s/indexlist_1657348185_2.html
?
?
IP包的生成和發送接口
====================
(1) Linux內核中有3種基本的IP包生成器, 它們分別為ip_build_xmit(), ip_queue_xmit(),
ip_build_and_send_pkt(). ip_build_and_send_pkt()是一簡單的IP包頭封裝接口,
它按照輸入包的路由添加一個IP包頭后直接輸出,不進行分片處理, 用于tcp_v4_send_synack()中.
ip_send_reply()是基于ip_build_xmit()的一個函數,
用于tcp_v4_send_ack()和tcp_v4_send_reset()中.
?
(2) ip_build_xmit()使用用戶定義的回調函數直接讀取用戶數據片段生成IP包輸出.
如果需要分片,ip_build_xmit()按照最后一個片段到第一個片段的順序來生成IP包,
這是因為第一個IP包片段的數據區可能包含對整個IP包數據區的校驗碼,
在回調函數中用戶可能會計算輸出數據的校驗碼,
采用從后向前的輸出順序可使校驗碼自然地寫到第一個片段中.
?
(3) ip_queue_xmit()完成面向連接套接字輸出包的路由和IP包頭封裝. 當套接字處于連接狀態時,
所有從套接字發出的包都具有確定的路由, 無需為每一個輸出包查詢它的目的入口,
可將套接字直接綁定到路由入口上, 這由套接字的目的緩沖指針(dst_cache)來完成.
ip_queue_xmit()首先為輸入包建立IP包頭, 經過本地包過濾器后,
再將IP包分片輸出(ip_fragment), 如果需要的話.
?
(4) IP包生成器的輸出經過本地包過濾器后輸入包的路由入口, 對于點播地址來說,
輸入到IP輸出器中(ip_output); 對于廣播或同播地址來說, 輸入到IP同播輸出器(ip_mc_output).
在IP輸出器中, 再經過路由后過濾器,
進入路由的"鄰居"入口(dst->neighbour->output)或硬件幀頭緩沖入口(dst->hh->hh_output).
鄰居是指與主機自己在網絡接口設備層次上直達的相鄰主機.
鄰居負責解析輸出包的硬件投送地址, 將包投遞給相鄰的目的主機或網關主機.
當鄰居成功解析包的硬件投送地址時, 將在包的目的入口上創建硬件幀頭緩沖結構(dst->hh),
使得后繼包可以直接使用組裝好的幀頭, 直接將包傳遞給包調度器(dev_queue_xmit).
包調度器按照包的優先級進行重排, 最后將包提交給設備驅動程序發送(dev->hard_start_xmit).
?
?
IP包生成接口
------------
; net/ipv4/ip_output.c:
?
/* Default IP time-to-live; the original annotation gives IPDEFTTL as 64. */
int sysctl_ip_default_ttl = IPDEFTTL;
?
?
/*
 * Prepend an IP header (and the options, if any) to the data already in
 * skb and pass the packet to the NF_IP_LOCAL_OUT hook.  The packet is
 * sent as a single datagram; no fragmentation is attempted here.
 *
 * skb:   buffer to send; skb->dst is read as the packet's struct rtable
 * sk:    socket supplying tos, ttl and protocol for the header
 * saddr: not referenced in this function (the source address is taken
 *        from the route)
 * daddr: handed to ip_options_build() when options are present
 * opt:   IP options to insert, or NULL for none
 *
 * Returns whatever NF_HOOK() returns.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  u32 saddr, u32 daddr, struct ip_options *opt)
{
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Make room in front of the payload for the header (plus options). */
	if (opt)
		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
	else
		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));

	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = sk->protinfo.af_inet.tos;
	iph->frag_off = 0;
	/* Set DF when fragmentation is not allowed for this socket/route. */
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off |= htons(IP_DF);
	iph->ttl      = sk->protinfo.af_inet.ttl;	/* TTL from the socket */
	iph->daddr    = rt->rt_dst;			/* destination from the route */
	iph->saddr    = rt->rt_src;			/* source from the route */
	iph->protocol = sk->protocol;			/* socket's IP protocol number */
	iph->tot_len  = htons(skb->len);		/* total length of the packet */
	/*
	 * Assign the identification field.  NOTE(review): the original
	 * annotation states that DF packets get an identification of zero;
	 * confirm against ip_select_ident().
	 */
	ip_select_ident(iph, &rt->u.dst);
	skb->nh.iph   = iph;

	if (opt && opt->optlen) {
		/* optlen is in bytes, ihl counts 32-bit words. */
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);	/* fill in the option area */
	}
	ip_send_check(iph);	/* compute the header checksum */

	/*
	 * Run the local-output filter hook.  The continuation is
	 * output_maybe_reroute; per the original annotation the destination
	 * path may be changed by the filter.
	 */
	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
		       output_maybe_reroute);
}
?
int ip_build_xmit(struct sock *sk,
????????????? ? int getfrag (const void *,
???????????????????? ?????? char *,
???????????????????? ?????? unsigned int,????
???????????????????? ?????? unsigned int), 取數(shù)據(jù)片段的函數(shù)指針
????????????? ? const void *frag, 以上函數(shù)的調用參數(shù)
????????????? ? unsigned length,
????????????? ? struct ipcm_cookie *ipc, IP包配置信息
????????????? ? struct rtable *rt,
????????????? ? int flags) 從用戶數(shù)據(jù)建立IP包
{
?????? int err;
?????? struct sk_buff *skb;
?????? int df;
?????? struct iphdr *iph;
?
??????
?
?????? if (!sk->protinfo.af_inet.hdrincl) { 如果IP包頭不由用戶創(chuàng)建
????????????? length += sizeof(struct iphdr); 取IP包總長
?
?????????????
????????????? if (length > rt->u.dst.pmtu || ipc->opt != NULL) 如果包長度大于目的入口的最大片斷長
???????????????????? return ip_build_xmit_slow(sk,getfrag,frag,length,ipc,rt,flags);
?????? } else {
????????????? if (length > rt->u.dst.dev->mtu) { 如果包長大于目的入口設備的最大片段長
???????????????????? ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, rt->u.dst.dev->mtu);
???????????????????? return -EMSGSIZE;
????????????? }
?????? }
?????? if (flags&MSG_PROBE) 測試操作
????????????? goto out;
?
??????
?????? df = 0;
?????? if (ip_dont_fragment(sk, &rt->u.dst)) 如果禁止分片
????????????? df = htons(IP_DF);
?
??????
?????? {
?????? int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
?
?????? skb = sock_alloc_send_skb(sk, length+hh_len+15,
??????????????????????????? ? 0, flags&MSG_DONTWAIT, &err); 為套接字分配發(fā)送包
?????? if(skb==NULL)
????????????? goto error;
?????? skb_reserve(skb, hh_len); 保留硬件幀頭空間
?????? }
?
?????? skb->priority = sk->priority; 取套接字的優(yōu)先級
?????? skb->dst = dst_clone(&rt->u.dst); 取路由的目的入口
?
?????? skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
?
?????? if(!sk->protinfo.af_inet.hdrincl) {
????????????? iph->version=4;
????????????? iph->ihl=5;
????????????? iph->tos=sk->protinfo.af_inet.tos;
????????????? iph->tot_len = htons(length);
????????????? iph->frag_off = df;
????????????? iph->ttl=sk->protinfo.af_inet.mc_ttl;
????????????? ip_select_ident(iph, &rt->u.dst);
????????????? if (rt->rt_type != RTN_MULTICAST)
???????????????????? iph->ttl=sk->protinfo.af_inet.ttl;
????????????? iph->protocol=sk->protocol;?
????????????? iph->saddr=rt->rt_src;
????????????? iph->daddr=rt->rt_dst;
????????????? iph->check=0;
????????????? iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
????????????? err = getfrag(frag, ((char *)iph)+iph->ihl*4,0, length-iph->ihl*4);
????????????? ; 讀取用戶一片數(shù)據(jù)
?????? }
?????? else 如果IP包頭由用戶創(chuàng)建, 直接將用戶數(shù)據(jù)讀入IP頭所在位置
????????????? err = getfrag(frag, (void *)iph, 0, length);
?
?????? if (err)
????????????? goto error_fault;
?
?????? err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
????????????? ????? output_maybe_reroute);
?????? if (err > 0)
????????????? err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
?????? if (err)
????????????? goto error;
out:
?????? return 0;
?
error_fault:
?????? err = -EFAULT;
?????? kfree_skb(skb);
error:
?????? IP_INC_STATS(IpOutDiscards);
?????? return err;
}
static int ip_build_xmit_slow(struct sock *sk,
????????????? ? int getfrag (const void *,
???????????????????? ?????? char *,
???????????????????? ?????? unsigned int,????
???????????????????? ?????? unsigned int),
????????????? ? const void *frag,
????????????? ? unsigned length,
????????????? ? struct ipcm_cookie *ipc,
????????????? ? struct rtable *rt,
????????????? ? int flags) 建立IP選項區(qū)或者分片輸出
{
?????? unsigned int fraglen, maxfraglen, fragheaderlen;
?????? int err;
?????? int offset, mf;
?????? int mtu;
?????? u16 id = 0;
?
?????? int hh_len = (rt->u.dst.dev->hard_header_len + 15)&~15;
?????? int nfrags=0;
?????? struct ip_options *opt = ipc->opt;
?????? int df = 0;
?
?????? mtu = rt->u.dst.pmtu;
?????? if (ip_dont_fragment(sk, &rt->u.dst))
????????????? df = htons(IP_DF);
?
?????? length -= sizeof(struct iphdr);
?
?????? if (opt) {
????????????? fragheaderlen = sizeof(struct iphdr) + opt->optlen;
????????????? maxfraglen = ((mtu-sizeof(struct iphdr)-opt->optlen) & ~7) + fragheaderlen;
?????? } else {
?????? ?????? fragheaderlen = sizeof(struct iphdr);
?
?????????????
?
????????????? maxfraglen = ((mtu-sizeof(struct iphdr)) & ~7) + fragheaderlen;
?????? } 求最大IP包長
?
?????? if (length + fragheaderlen > 0xFFFF) {
????????????? ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
????????????? return -EMSGSIZE;
?????? }
?
??????
?
?????? offset = length - (length % (maxfraglen - fragheaderlen));取最后一個片段的數(shù)據(jù)偏移量
?
??????
?
?????? fraglen = length - offset + fragheaderlen; 求取后一個片段IP包全長
?
?????? if (length-offset==0) { 如果用戶數(shù)據(jù)恰好是最大單片數(shù)據(jù)長度的整數(shù)倍
????????????? fraglen = maxfraglen;
????????????? offset -= maxfraglen-fragheaderlen;
?????? }
?
??????
?
?????? mf = 0;
?
??????
?
?????? if (offset > 0 && sk->protinfo.af_inet.pmtudisc==IP_PMTUDISC_DO) {
????????????? ip_local_error(sk, EMSGSIZE, rt->rt_dst, sk->dport, mtu);
???????????? return -EMSGSIZE;
?????? }
?????? if (flags&MSG_PROBE)
????????????? goto out;
?
??????
?
?????? do {
????????????? char *data;
????????????? struct sk_buff * skb;
?
?????????????
?
????????????? skb = sock_alloc_send_skb(sk, fraglen+hh_len+15, 0, flags&MSG_DONTWAIT, &err);
????????????? if (skb == NULL)
???????????????????? goto error;
?
?????????????
?
????????????? skb->priority = sk->priority;
????????????? skb->dst = dst_clone(&rt->u.dst);
????????????? skb_reserve(skb, hh_len);
?
?????????????
?
????????????? data = skb_put(skb, fraglen);
????????????? skb->nh.iph = (struct iphdr *)data;
?
?????????????
?
????????????? {
???????????????????? struct iphdr *iph = (struct iphdr *)data;
?
???????????????????? iph->version = 4;
???????????????????? iph->ihl = 5;
???????????????????? if (opt) {
??????????????????????????? iph->ihl += opt->optlen>>2;
??????????????????????????? ip_options_build(skb, opt,
????????????????????????????????????????? ?ipc->addr, rt, offset);
???????????????????? }
???????????????????? iph->tos = sk->protinfo.af_inet.tos;
???????????????????? iph->tot_len = htons(fraglen - fragheaderlen + iph->ihl*4);
???????????????????? iph->frag_off = htons(offset>>3)|mf|df;
???????????????????? iph->id = id;
???????????????????? if (!mf) {
??????????????????????????? if (offset || !df) {
??????????????????????????????????
?????????????????????????????????? __ip_select_ident(iph, &rt->u.dst);
?????????????????????????????????? id = iph->id;
??????????????????????????? }
?
???????????????????????????
??????????????????????????? mf = htons(IP_MF);
???????????????????? }
???????????????????? if (rt->rt_type == RTN_MULTICAST)
??????????????????????????? iph->ttl = sk->protinfo.af_inet.mc_ttl;
???????????????????? else
??????????????????????????? iph->ttl = sk->protinfo.af_inet.ttl;
???????????????????? iph->protocol = sk->protocol;
???????????????????? iph->check = 0;
???????????????????? iph->saddr = rt->rt_src;
???????????????????? iph->daddr = rt->rt_dst;
???????????????????? iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
???????????????????? data += iph->ihl*4;
????????????? }
?
?????????????
?
????????????? if (getfrag(frag, data, offset, fraglen-fragheaderlen)) {
???????????????????? err = -EFAULT;
???????????????????? kfree_skb(skb);
???????????????????? goto error;
????????????? }
?
????????????? offset -= (maxfraglen-fragheaderlen); 片段從后向前進行分割, 是為了方便TCP包的校驗
????????????? fraglen = maxfraglen;
?
????????????? nfrags++;
?
????????????? err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
???????????????????? ????? skb->dst->dev, output_maybe_reroute);
????????????? if (err) {
???????????????????? if (err > 0)
??????????????????????????? err = sk->protinfo.af_inet.recverr ? net_xmit_errno(err) : 0;
???????????????????? if (err)
??????????????????????????? goto error;
????????????? }
?????? } while (offset >= 0);
?
?????? if (nfrags>1)
????????????? ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
out:
?????? return 0;
?
error:
?????? IP_INC_STATS(IpOutDiscards);
?????? if (nfrags>1)
????????????? ip_statistics[smp_processor_id()*2 + !in_softirq()].IpFragCreates += nfrags;
?????? return err;
}
?
?
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
????????????? ?? unsigned int len)
{
?????? struct {
????????????? struct ip_options??? opt;
????????????? char?????????????? data[40]; 存放IP選項塊
?????? } replyopts;
?????? struct ipcm_cookie ipc;
?????? u32 daddr;
?????? struct rtable *rt = (struct rtable*)skb->dst;
?
?????? if (ip_options_echo(&replyopts.opt, skb)) 將包skb的IP選項刷新到replyopts結構中
????????????? return;
?
?????? daddr = ipc.addr = rt->rt_src;
?????? ipc.opt = NULL;
?
?????? if (replyopts.opt.optlen) {
????????????? ipc.opt = &replyopts.opt;
?
????????????? if (ipc.opt->srr)
???????????????????? daddr = replyopts.opt.faddr;
?????? }
?
?????? if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0))
????????????? return;
?
??????
?????? bh_lock_sock(sk);
?????? sk->protinfo.af_inet.tos = skb->nh.iph->tos;
?????? sk->priority = skb->priority;
?????? sk->protocol = skb->nh.iph->protocol;
?????? ip_build_xmit(sk, ip_reply_glue_bits, arg, len, &ipc, rt, MSG_DONTWAIT);
?????? bh_unlock_sock(sk);
?
?????? ip_rt_put(rt);
}
struct ip_reply_arg {
?????? struct iovec iov[2];??
?????? int????????? n_iov;???
?????? u32 ?????? ?????csum;
?????? int??? ???? csumoffset;
??????????????????????????? ?
};
?
static int ip_reply_glue_bits(const void *dptr, char *to, unsigned int offset,
???????????????????? ???? ?unsigned int fraglen)
{
????? struct ip_reply_arg *dp = (struct ip_reply_arg*)dptr;
?????? u16 *pktp = (u16 *)to;
?????? struct iovec *iov;
?????? int len;
?????? int hdrflag = 1;
?
?????? iov = &dp->iov[0];
?????? if (offset >= iov->iov_len) {
????????????? offset -= iov->iov_len;
????????????? iov++;
????????????? hdrflag = 0;
?????? }
?????? len = iov->iov_len - offset;
?????? if (fraglen > len) {
????????????? dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, len,
?????????????????????????????????? ???? dp->csum);
????????????? offset = 0;
????????????? fraglen -= len;
????????????? to += len;
????????????? iov++;
?????? }
?
?????? dp->csum = csum_partial_copy_nocheck(iov->iov_base+offset, to, fraglen,
?????????????????????????????????? ???? dp->csum);
?
?????? if (hdrflag && dp->csumoffset)
????????????? *(pktp + dp->csumoffset) = csum_fold(dp->csum);
?????? return 0;? ??????
}
?
int ip_queue_xmit(struct sk_buff *skb)
{
?????? struct sock *sk = skb->sk;
?????? struct ip_options *opt = sk->protinfo.af_inet.opt;
?????? struct rtable *rt;
?????? struct iphdr *iph;
?
??????
?????? rt = (struct rtable *)__sk_dst_check(sk, 0); 取套接字所緩沖的發(fā)送包的目的路由入口
?????? if (rt == NULL) { 如果尚未緩沖
????????????? u32 daddr;
?
?????????????
????????????? daddr = sk->daddr; 取套接字的對端地址作為目的地址
????????????? if(opt && opt->srr) 如果具有信源路由選項
???????????????????? daddr = opt->faddr; 取信源路由的轉發(fā)地址作為目的地址
?
?????????????
????????????? if (ip_route_output(&rt, daddr, sk->saddr,
??????????????????????????? ??? RT_TOS(sk->protinfo.af_inet.tos) | RTO_CONN | sk->localroute,
??????????????????????????? ??? sk->bound_dev_if)) 查詢目的地址的路由目的入口
???????????????????? goto no_route;
????????????? __sk_dst_set(sk, &rt->u.dst); 將該路由入口緩沖到套接字上
?????? }
?????? skb->dst = dst_clone(&rt->u.dst); 將路由入口綁定到發(fā)送包
?
?????? if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
????????????? goto no_route; 如果是指定嚴格信源路由并且其轉發(fā)地址不等于網關地址,則操作失敗
?
??????
?????? iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen :0));
?????? *((__u16 *)iph)????? = htons((4 << 12) | (5 << 8) | (sk->protinfo.af_inet.tos & 0xff));
?????? iph->tot_len = htons(skb->len);
?????? iph->frag_off = 0;
?????? iph->ttl????? = sk->protinfo.af_inet.ttl;
?????? iph->protocol = sk->protocol;
?????? iph->saddr??? = rt->rt_src;
?????? iph->daddr??? = rt->rt_dst;
?????? skb->nh.iph?? = iph;
??????
?
?????? if(opt && opt->optlen) { 建立IP選項區(qū)
????????????? iph->ihl += opt->optlen >> 2;
????????????? ip_options_build(skb, opt, sk->daddr, rt, 0);
?????? }
?
?????? return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
????????????? ?????? ip_queue_xmit2); 過濾輸出
?
no_route:
?????? IP_INC_STATS(IpOutNoRoutes);
?????? kfree_skb(skb);
?????? return -EHOSTUNREACH;
}
?
轉載于:https://www.cnblogs.com/jinrize/archive/2009/11/28/1612584.html
總結
以上是生活随笔為你收集整理的IP包的生成和发送接口(1)的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: oracle opatch那个回退,下面
- 下一篇: 【2016Esri全球用户大会主题亮点】